From 75182c755e95a4e6e283e7d8064398d8550a2172 Mon Sep 17 00:00:00 2001 From: AlongWY Date: Thu, 19 Oct 2023 05:21:27 +0000 Subject: [PATCH] deploy: 72066be21ad467c8ffc76b74c152b38decf3f0ac --- .nojekyll | 0 cache.json | 1 + favicon.ico | Bin 0 -> 15086 bytes index.css | 355 + index.html | 82160 ++++++++++++++++++++++++++++++++++++++++++++++++++ index.js | 39 + 6 files changed, 82555 insertions(+) create mode 100644 .nojekyll create mode 100644 cache.json create mode 100644 favicon.ico create mode 100644 index.css create mode 100644 index.html create mode 100644 index.js diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 00000000..e69de29b diff --git a/cache.json b/cache.json new file mode 100644 index 00000000..61e54dfe --- /dev/null +++ b/cache.json @@ -0,0 +1 @@ +{"2023-10-11T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2305.05658v2","updated":"2023-10-11T17:59:44Z","published":"2023-05-09T17:52:59Z","title":"TidyBot: Personalized Robot Assistance with Large Language Models","summary":" For a robot to personalize physical assistance effectively, it must learn\nuser preferences that can be generally reapplied to future scenarios. In this\nwork, we investigate personalization of household cleanup with robots that can\ntidy up rooms by picking up objects and putting them away. A key challenge is\ndetermining the proper place to put each object, as people's preferences can\nvary greatly depending on personal taste or cultural background. For instance,\none person may prefer storing shirts in the drawer, while another may prefer\nthem on the shelf. We aim to build systems that can learn such preferences from\njust a handful of examples via prior interactions with a particular person. We\nshow that robots can combine language-based planning and perception with the\nfew-shot summarization capabilities of large language models (LLMs) to infer\ngeneralized user preferences that are broadly applicable to future\ninteractions. This approach enables fast adaptation and achieves 91.2% accuracy\non unseen objects in our benchmark dataset. We also demonstrate our approach on\na real-world mobile manipulator called TidyBot, which successfully puts away\n85.0% of objects in real-world test scenarios.\n","authors":["Jimmy Wu","Rika Antonova","Adam Kan","Marion Lepert","Andy Zeng","Shuran Song","Jeannette Bohg","Szymon Rusinkiewicz","Thomas Funkhouser"],"pdf_url":"https://arxiv.org/pdf/2305.05658v2.pdf","comment":"Accepted to Autonomous Robots (AuRo) - Special Issue: Large Language\n Models in Robotics, 2023 and IEEE/RSJ International Conference on Intelligent\n Robots and Systems (IROS), 2023. Project page:\n https://tidybot.cs.princeton.edu"},{"id":"http://arxiv.org/abs/2310.07715v1","updated":"2023-10-11T17:59:36Z","published":"2023-10-11T17:59:36Z","title":"To Build Our Future, We Must Know Our Past: Contextualizing Paradigm\n Shifts in Natural Language Processing","summary":" NLP is in a period of disruptive change that is impacting our methodologies,\nfunding sources, and public perception. In this work, we seek to understand how\nto shape our future by better understanding our past. We study factors that\nshape NLP as a field, including culture, incentives, and infrastructure by\nconducting long-form interviews with 26 NLP researchers of varying seniority,\nresearch area, institution, and social identity. 
Our interviewees identify\ncyclical patterns in the field, as well as new shifts without historical\nparallel, including changes in benchmark culture and software infrastructure.\nWe complement this discussion with quantitative analysis of citation,\nauthorship, and language use in the ACL Anthology over time. We conclude by\ndiscussing shared visions, concerns, and hopes for the future of NLP. We hope\nthat this study of our field's past and present can prompt informed discussion\nof our community's implicit norms and more deliberate action to consciously\nshape the future.\n","authors":["Sireesh Gururaja","Amanda Bertsch","Clara Na","David Gray Widder","Emma Strubell"],"pdf_url":"https://arxiv.org/pdf/2310.07715v1.pdf","comment":"Accepted to EMNLP 2023"},{"id":"http://arxiv.org/abs/2310.07713v1","updated":"2023-10-11T17:59:05Z","published":"2023-10-11T17:59:05Z","title":"InstructRetro: Instruction Tuning post Retrieval-Augmented Pretraining","summary":" Pretraining auto-regressive large language models (LLMs) with retrieval\ndemonstrates better perplexity and factual accuracy by leveraging external\ndatabases. However, the size of existing pretrained retrieval-augmented LLM is\nstill limited (e.g., Retro has 7.5B parameters), which limits the effectiveness\nof instruction tuning and zero-shot generalization. In this work, we introduce\nRetro 48B, the largest LLM pretrained with retrieval before instruction tuning.\nSpecifically, we continue to pretrain the 43B GPT model on additional 100\nbillion tokens using the Retro augmentation method by retrieving from 1.2\ntrillion tokens. The obtained foundation model, Retro 48B, largely outperforms\nthe original 43B GPT in terms of perplexity. After instruction tuning on Retro,\nInstructRetro demonstrates significant improvement over the instruction tuned\nGPT on zero-shot question answering (QA) tasks. Specifically, the average\nimprovement of InstructRetro is 7% over its GPT counterpart across 8 short-form\nQA tasks, and 10% over GPT across 4 challenging long-form QA tasks.\nSurprisingly, we find that one can ablate the encoder from InstructRetro\narchitecture and directly use its decoder backbone, while achieving comparable\nresults. We hypothesize that pretraining with retrieval makes its decoder good\nat incorporating context for QA. Our results highlights the promising direction\nto obtain a better GPT decoder for QA through continued pretraining with\nretrieval before instruction tuning.\n","authors":["Boxin Wang","Wei Ping","Lawrence McAfee","Peng Xu","Bo Li","Mohammad Shoeybi","Bryan Catanzaro"],"pdf_url":"https://arxiv.org/pdf/2310.07713v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07712v1","updated":"2023-10-11T17:59:02Z","published":"2023-10-11T17:59:02Z","title":"Found in the Middle: Permutation Self-Consistency Improves Listwise\n Ranking in Large Language Models","summary":" Large language models (LLMs) exhibit positional bias in how they use context,\nwhich especially complicates listwise ranking. To address this, we propose\npermutation self-consistency, a form of self-consistency over ranking list\noutputs of black-box LLMs. Our key idea is to marginalize out different list\norders in the prompt to produce an order-independent ranking with less\npositional bias. First, given some input prompt, we repeatedly shuffle the list\nin the prompt and pass it through the LLM while holding the instructions the\nsame. 
Next, we aggregate the resulting sample of rankings by computing the\ncentral ranking closest in distance to all of them, marginalizing out prompt\norder biases in the process. Theoretically, we prove the robustness of our\nmethod, showing convergence to the true ranking in the presence of random\nperturbations. Empirically, on five list-ranking datasets in sorting and\npassage reranking, our approach improves scores from conventional inference by\nup to 7-18% for GPT-3.5 and 8-16% for LLaMA v2 (70B), surpassing the previous\nstate of the art in passage reranking. Our code is at\nhttps://github.com/castorini/perm-sc.\n","authors":["Raphael Tang","Xinyu Zhang","Xueguang Ma","Jimmy Lin","Ferhan Ture"],"pdf_url":"https://arxiv.org/pdf/2310.07712v1.pdf","comment":"First two authors contributed equally; 10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2310.07710v1","updated":"2023-10-11T17:57:35Z","published":"2023-10-11T17:57:35Z","title":"DiPmark: A Stealthy, Efficient and Resilient Watermark for Large\n Language Models","summary":" Watermarking techniques offer a promising way to secure data via embedding\ncovert information into the data. A paramount challenge in the domain lies in\npreserving the distribution of original data during watermarking. Our research\nextends and refines existing watermarking framework, placing emphasis on the\nimportance of a distribution-preserving (DiP) watermark. Contrary to the\ncurrent strategies, our proposed DiPmark preserves the original token\ndistribution during watermarking (stealthy), is detectable without access to\nthe language model API or weights (efficient), and is robust to moderate\nchanges of tokens (resilient). This is achieved by incorporating a novel\nreweight strategy, combined with a hash function that assigns unique\n\\textit{i.i.d.} ciphers based on the context. The empirical benchmarks of our\napproach underscore its stealthiness, efficiency, and resilience, making it a\nrobust solution for watermarking tasks that demand impeccable quality\npreservation.\n","authors":["Yihan Wu","Zhengmian Hu","Hongyang Zhang","Heng Huang"],"pdf_url":"https://arxiv.org/pdf/2310.07710v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07707v1","updated":"2023-10-11T17:57:14Z","published":"2023-10-11T17:57:14Z","title":"MatFormer: Nested Transformer for Elastic Inference","summary":" Transformer models are deployed in a wide range of settings, from\nmulti-accelerator clusters to standalone mobile phones. The diverse inference\nconstraints in these scenarios necessitate practitioners to train foundation\nmodels such as PaLM 2, Llama, & ViTs as a series of models of varying sizes.\nDue to significant training costs, only a select few model sizes are trained\nand supported, limiting more fine-grained control over relevant tradeoffs,\nincluding latency, cost, and accuracy. This work introduces MatFormer, a nested\nTransformer architecture designed to offer elasticity in a variety of\ndeployment constraints. Each Feed Forward Network (FFN) block of a MatFormer\nmodel is jointly optimized with a few nested smaller FFN blocks. This training\nprocedure allows for the Mix'n'Match of model granularities across layers --\ni.e., a trained universal MatFormer model enables extraction of hundreds of\naccurate smaller models, which were never explicitly optimized. We empirically\ndemonstrate MatFormer's effectiveness across different model classes (decoders\n& encoders), modalities (language & vision), and scales (up to 2.6B\nparameters). 
We find that a 2.6B decoder-only MatFormer language model (MatLM)\nallows us to extract smaller models spanning from 1.5B to 2.6B, each exhibiting\ncomparable validation loss and one-shot downstream evaluations to their\nindependently trained counterparts. Furthermore, we observe that smaller\nencoders extracted from a universal MatFormer-based ViT (MatViT) encoder\npreserve the metric-space structure for adaptive large-scale retrieval.\nFinally, we showcase that speculative decoding with the accurate and consistent\nsubmodels extracted from MatFormer can further reduce inference latency.\n","authors":[" Devvrit","Sneha Kudugunta","Aditya Kusupati","Tim Dettmers","Kaifeng Chen","Inderjit Dhillon","Yulia Tsvetkov","Hannaneh Hajishirzi","Sham Kakade","Ali Farhadi","Prateek Jain"],"pdf_url":"https://arxiv.org/pdf/2310.07707v1.pdf","comment":"31 pages, 12 figures, first three authors contributed equally"},{"id":"http://arxiv.org/abs/2310.07704v1","updated":"2023-10-11T17:55:15Z","published":"2023-10-11T17:55:15Z","title":"Ferret: Refer and Ground Anything Anywhere at Any Granularity","summary":" We introduce Ferret, a new Multimodal Large Language Model (MLLM) capable of\nunderstanding spatial referring of any shape or granularity within an image and\naccurately grounding open-vocabulary descriptions. To unify referring and\ngrounding in the LLM paradigm, Ferret employs a novel and powerful hybrid\nregion representation that integrates discrete coordinates and continuous\nfeatures jointly to represent a region in the image. To extract the continuous\nfeatures of versatile regions, we propose a spatial-aware visual sampler, adept\nat handling varying sparsity across different shapes. Consequently, Ferret can\naccept diverse region inputs, such as points, bounding boxes, and free-form\nshapes. To bolster the desired capability of Ferret, we curate GRIT, a\ncomprehensive refer-and-ground instruction tuning dataset including 1.1M\nsamples that contain rich hierarchical spatial knowledge, with 95K hard\nnegative data to promote model robustness. The resulting model not only\nachieves superior performance in classical referring and grounding tasks, but\nalso greatly outperforms existing MLLMs in region-based and\nlocalization-demanded multimodal chatting. Our evaluations also reveal a\nsignificantly improved capability of describing image details and a remarkable\nalleviation in object hallucination. Code and data will be available at\nhttps://github.com/apple/ml-ferret\n","authors":["Haoxuan You","Haotian Zhang","Zhe Gan","Xianzhi Du","Bowen Zhang","Zirui Wang","Liangliang Cao","Shih-Fu Chang","Yinfei Yang"],"pdf_url":"https://arxiv.org/pdf/2310.07704v1.pdf","comment":"30 pages, 10 figures. Code/Project Website:\n https://github.com/apple/ml-ferret"},{"id":"http://arxiv.org/abs/2310.07700v1","updated":"2023-10-11T17:51:28Z","published":"2023-10-11T17:51:28Z","title":"Knowledge-enhanced Memory Model for Emotional Support Conversation","summary":" The prevalence of mental disorders has become a significant issue, leading to\nthe increased focus on Emotional Support Conversation as an effective\nsupplement for mental health support. Existing methods have achieved compelling\nresults, however, they still face three challenges: 1) variability of emotions,\n2) practicality of the response, and 3) intricate strategy modeling. To address\nthese challenges, we propose a novel knowledge-enhanced Memory mODEl for\nemotional suppoRt coNversation (MODERN). 
Specifically, we first devise a\nknowledge-enriched dialogue context encoding to perceive the dynamic emotion\nchange of different periods of the conversation for coherent user state\nmodeling and select context-related concepts from ConceptNet for practical\nresponse generation. Thereafter, we implement a novel memory-enhanced strategy\nmodeling module to model the semantic patterns behind the strategy categories.\nExtensive experiments on a widely used large-scale dataset verify the\nsuperiority of our model over cutting-edge baselines.\n","authors":["Mengzhao Jia","Qianglong Chen","Liqiang Jing","Dawei Fu","Renyu Li"],"pdf_url":"https://arxiv.org/pdf/2310.07700v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.08896v3","updated":"2023-10-11T17:43:28Z","published":"2023-03-15T19:31:21Z","title":"SelfCheckGPT: Zero-Resource Black-Box Hallucination Detection for\n Generative Large Language Models","summary":" Generative Large Language Models (LLMs) such as GPT-3 are capable of\ngenerating highly fluent responses to a wide variety of user prompts. However,\nLLMs are known to hallucinate facts and make non-factual statements which can\nundermine trust in their output. Existing fact-checking approaches either\nrequire access to the output probability distribution (which may not be\navailable for systems such as ChatGPT) or external databases that are\ninterfaced via separate, often complex, modules. In this work, we propose\n\"SelfCheckGPT\", a simple sampling-based approach that can be used to fact-check\nthe responses of black-box models in a zero-resource fashion, i.e. without an\nexternal database. SelfCheckGPT leverages the simple idea that if an LLM has\nknowledge of a given concept, sampled responses are likely to be similar and\ncontain consistent facts. However, for hallucinated facts, stochastically\nsampled responses are likely to diverge and contradict one another. We\ninvestigate this approach by using GPT-3 to generate passages about individuals\nfrom the WikiBio dataset, and manually annotate the factuality of the generated\npassages. We demonstrate that SelfCheckGPT can: i) detect non-factual and\nfactual sentences; and ii) rank passages in terms of factuality. We compare our\napproach to several baselines and show that our approach has considerably\nhigher AUC-PR scores in sentence-level hallucination detection and higher\ncorrelation scores in passage-level factuality assessment compared to grey-box\nmethods.\n","authors":["Potsawee Manakul","Adian Liusie","Mark J. F. Gales"],"pdf_url":"https://arxiv.org/pdf/2303.08896v3.pdf","comment":"EMNLP 2023 (main conference)"},{"id":"http://arxiv.org/abs/2310.04381v2","updated":"2023-10-11T17:36:12Z","published":"2023-10-06T17:19:40Z","title":"Hermes: Unlocking Security Analysis of Cellular Network Protocols by\n Synthesizing Finite State Machines from Natural Language Specifications","summary":" In this paper, we present Hermes, an end-to-end framework to automatically\ngenerate formal representations from natural language cellular specifications.\nWe first develop a neural constituency parser, NEUTREX, to process\ntransition-relevant texts and extract transition components (i.e., states,\nconditions, and actions). We also design a domain-specific language to\ntranslate these transition components to logical formulas by leveraging\ndependency parse trees. Finally, we compile these logical formulas to generate\ntransitions and create the formal model as finite state machines. 
To\ndemonstrate the effectiveness of Hermes, we evaluate it on 4G NAS, 5G NAS, and\n5G RRC specifications and obtain an overall accuracy of 81-87%, which is a\nsubstantial improvement over the state-of-the-art. Our security analysis of the\nextracted models uncovers 3 new vulnerabilities and identifies 19 previous\nattacks in 4G and 5G specifications, and 7 deviations in commercial 4G\nbasebands.\n","authors":["Abdullah Al Ishtiaq","Sarkar Snigdha Sarathi Das","Syed Md Mukit Rashid","Ali Ranjbar","Kai Tu","Tianwei Wu","Zhezheng Song","Weixuan Wang","Mujtahid Akon","Rui Zhang","Syed Rafiul Hussain"],"pdf_url":"https://arxiv.org/pdf/2310.04381v2.pdf","comment":"Accepted at USENIX Security 24"},{"id":"http://arxiv.org/abs/2306.12424v2","updated":"2023-10-11T17:34:19Z","published":"2023-06-21T17:59:51Z","title":"VisoGender: A dataset for benchmarking gender bias in image-text pronoun\n resolution","summary":" We introduce VisoGender, a novel dataset for benchmarking gender bias in\nvision-language models. We focus on occupation-related biases within a\nhegemonic system of binary gender, inspired by Winograd and Winogender schemas,\nwhere each image is associated with a caption containing a pronoun relationship\nof subjects and objects in the scene. VisoGender is balanced by gender\nrepresentation in professional roles, supporting bias evaluation in two ways:\ni) resolution bias, where we evaluate the difference between pronoun resolution\naccuracies for image subjects with gender presentations perceived as masculine\nversus feminine by human annotators and ii) retrieval bias, where we compare\nratios of professionals perceived to have masculine and feminine gender\npresentations retrieved for a gender-neutral search query. We benchmark several\nstate-of-the-art vision-language models and find that they demonstrate bias in\nresolving binary gender in complex scenes. While the direction and magnitude of\ngender bias depends on the task and the model being evaluated, captioning\nmodels are generally less biased than Vision-Language Encoders. Dataset and\ncode are available at https://github.com/oxai/visogender\n","authors":["Siobhan Mackenzie Hall","Fernanda Gonçalves Abrantes","Hanwen Zhu","Grace Sodunke","Aleksandar Shtedritski","Hannah Rose Kirk"],"pdf_url":"https://arxiv.org/pdf/2306.12424v2.pdf","comment":"Data and code available at https://github.com/oxai/visogender"},{"id":"http://arxiv.org/abs/2310.07676v1","updated":"2023-10-11T17:21:03Z","published":"2023-10-11T17:21:03Z","title":"Composite Backdoor Attacks Against Large Language Models","summary":" Large language models (LLMs) have demonstrated superior performance compared\nto previous methods on various tasks, and often serve as the foundation models\nfor many researches and services. However, the untrustworthy third-party LLMs\nmay covertly introduce vulnerabilities for downstream tasks. In this paper, we\nexplore the vulnerability of LLMs through the lens of backdoor attacks.\nDifferent from existing backdoor attacks against LLMs, ours scatters multiple\ntrigger keys in different prompt components. Such a Composite Backdoor Attack\n(CBA) is shown to be stealthier than implanting the same multiple trigger keys\nin only a single component. CBA ensures that the backdoor is activated only\nwhen all trigger keys appear. Our experiments demonstrate that CBA is effective\nin both natural language processing (NLP) and multimodal tasks. 
For instance,\nwith $3\\%$ poisoning samples against the LLaMA-7B model on the Emotion dataset,\nour attack achieves a $100\\%$ Attack Success Rate (ASR) with a False Triggered\nRate (FTR) below $2.06\\%$ and negligible model accuracy degradation. The unique\ncharacteristics of our CBA can be tailored for various practical scenarios,\ne.g., targeting specific user groups. Our work highlights the necessity of\nincreased security research on the trustworthiness of foundation LLMs.\n","authors":["Hai Huang","Zhengyu Zhao","Michael Backes","Yun Shen","Yang Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.07676v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.08703v3","updated":"2023-10-11T17:00:34Z","published":"2023-05-15T15:06:20Z","title":"Schema-adaptable Knowledge Graph Construction","summary":" Conventional Knowledge Graph Construction (KGC) approaches typically follow\nthe static information extraction paradigm with a closed set of pre-defined\nschema. As a result, such approaches fall short when applied to dynamic\nscenarios or domains, whereas a new type of knowledge emerges. This\nnecessitates a system that can handle evolving schema automatically to extract\ninformation for KGC. To address this need, we propose a new task called\nschema-adaptable KGC, which aims to continually extract entity, relation, and\nevent based on a dynamically changing schema graph without re-training. We\nfirst split and convert existing datasets based on three principles to build a\nbenchmark, i.e., horizontal schema expansion, vertical schema expansion, and\nhybrid schema expansion; then investigate the schema-adaptable performance of\nseveral well-known approaches such as Text2Event, TANL, UIE and GPT-3.5. We\nfurther propose a simple yet effective baseline dubbed \\textsc{AdaKGC}, which\ncontains schema-enriched prefix instructor and schema-conditioned dynamic\ndecoding to better handle evolving schema. Comprehensive experimental results\nillustrate that AdaKGC can outperform baselines but still have room for\nimprovement. We hope the proposed work can deliver benefits to the community.\nCode and datasets available at https://github.com/zjunlp/AdaKGC.\n","authors":["Hongbin Ye","Honghao Gui","Xin Xu","Huajun Chen","Ningyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2305.08703v3.pdf","comment":"EMNLP 2023 (Findings)"},{"id":"http://arxiv.org/abs/2310.07659v1","updated":"2023-10-11T17:00:29Z","published":"2023-10-11T17:00:29Z","title":"Well Begun is Half Done: Generator-agnostic Knowledge Pre-Selection for\n Knowledge-Grounded Dialogue","summary":" Accurate knowledge selection is critical in knowledge-grounded dialogue\nsystems. Towards a closer look at it, we offer a novel perspective to organize\nexisting literature, i.e., knowledge selection coupled with, after, and before\ngeneration. We focus on the third under-explored category of study, which can\nnot only select knowledge accurately in advance, but has the advantage to\nreduce the learning, adjustment, and interpretation burden of subsequent\nresponse generation models, especially LLMs. 
We propose GATE, a\ngenerator-agnostic knowledge selection method, to prepare knowledge for\nsubsequent response generation models by selecting context-related knowledge\namong different knowledge structures and variable knowledge requirements.\nExperimental results demonstrate the superiority of GATE, and indicate that\nknowledge selection before generation is a lightweight yet effective way to\nfacilitate LLMs (e.g., ChatGPT) to generate more informative responses.\n","authors":["Qin Lang","Zhang Yao","Liang Hongru","Wang jun","Yang Zhenglu"],"pdf_url":"https://arxiv.org/pdf/2310.07659v1.pdf","comment":"Accepted by EMNLP2023 main conference"},{"id":"http://arxiv.org/abs/2310.07654v1","updated":"2023-10-11T16:54:57Z","published":"2023-10-11T16:54:57Z","title":"Audio-Visual Neural Syntax Acquisition","summary":" We study phrase structure induction from visually-grounded speech. The core\nidea is to first segment the speech waveform into sequences of word segments,\nand subsequently induce phrase structure using the inferred segment-level\ncontinuous representations. We present the Audio-Visual Neural Syntax Learner\n(AV-NSL) that learns phrase structure by listening to audio and looking at\nimages, without ever being exposed to text. By training on paired images and\nspoken captions, AV-NSL exhibits the capability to infer meaningful phrase\nstructures that are comparable to those derived by naturally-supervised text\nparsers, for both English and German. Our findings extend prior work in\nunsupervised language acquisition from speech and grounded grammar induction,\nand present one approach to bridge the gap between the two topics.\n","authors":["Cheng-I Jeff Lai","Freda Shi","Puyuan Peng","Yoon Kim","Kevin Gimpel","Shiyu Chang","Yung-Sung Chuang","Saurabhchand Bhati","David Cox","David Harwath","Yang Zhang","Karen Livescu","James Glass"],"pdf_url":"https://arxiv.org/pdf/2310.07654v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.13172v2","updated":"2023-10-11T16:51:50Z","published":"2023-05-22T16:00:00Z","title":"Editing Large Language Models: Problems, Methods, and Opportunities","summary":" Despite the ability to train capable LLMs, the methodology for maintaining\ntheir relevancy and rectifying errors remains elusive. To this end, the past\nfew years have witnessed a surge in techniques for editing LLMs, the objective\nof which is to efficiently alter the behavior of LLMs within a specific domain\nwithout negatively impacting performance across other inputs. This paper\nembarks on a deep exploration of the problems, methods, and opportunities\nrelated to model editing for LLMs. In particular, we provide an exhaustive\noverview of the task definition and challenges associated with model editing,\nalong with an in-depth empirical analysis of the most progressive methods\ncurrently at our disposal. We also build a new benchmark dataset to facilitate\na more robust evaluation and pinpoint enduring issues intrinsic to existing\ntechniques. Our objective is to provide valuable insights into the\neffectiveness and feasibility of each editing technique, thereby assisting the\ncommunity in making informed decisions on the selection of the most appropriate\nmethod for a specific task or context. Code and datasets are available at\nhttps://github.com/zjunlp/EasyEdit.\n","authors":["Yunzhi Yao","Peng Wang","Bozhong Tian","Siyuan Cheng","Zhoubo Li","Shumin Deng","Huajun Chen","Ningyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2305.13172v2.pdf","comment":"EMNLP 2023. 
Updated with new experiments"},{"id":"http://arxiv.org/abs/2310.07652v1","updated":"2023-10-11T16:51:46Z","published":"2023-10-11T16:51:46Z","title":"LLM4Vis: Explainable Visualization Recommendation using ChatGPT","summary":" Data visualization is a powerful tool for exploring and communicating\ninsights in various domains. To automate visualization choice for datasets, a\ntask known as visualization recommendation has been proposed. Various\nmachine-learning-based approaches have been developed for this purpose, but\nthey often require a large corpus of dataset-visualization pairs for training\nand lack natural explanations for their results. To address this research gap,\nwe propose LLM4Vis, a novel ChatGPT-based prompting approach to perform\nvisualization recommendation and return human-like explanations using very few\ndemonstration examples. Our approach involves feature description,\ndemonstration example selection, explanation generation, demonstration example\nconstruction, and inference steps. To obtain demonstration examples with\nhigh-quality explanations, we propose a new explanation generation\nbootstrapping to iteratively refine generated explanations by considering the\nprevious generation and template-based hint. Evaluations on the VizML dataset\nshow that LLM4Vis outperforms or performs similarly to supervised learning\nmodels like Random Forest, Decision Tree, and MLP in both few-shot and\nzero-shot settings. The qualitative evaluation also shows the effectiveness of\nexplanations generated by LLM4Vis. We make our code publicly available at\n\\href{https://github.com/demoleiwang/LLM4Vis}{https://github.com/demoleiwang/LLM4Vis}.\n","authors":["Lei Wang","Songheng Zhang","Yun Wang","Ee-Peng Lim","Yong Wang"],"pdf_url":"https://arxiv.org/pdf/2310.07652v1.pdf","comment":"EMNLP 2023 (Industry Track)"},{"id":"http://arxiv.org/abs/2307.15770v2","updated":"2023-10-11T16:49:29Z","published":"2023-07-28T18:58:16Z","title":"CHATREPORT: Democratizing Sustainability Disclosure Analysis through\n LLM-based Tools","summary":" In the face of climate change, are companies really taking substantial steps\ntoward more sustainable operations? A comprehensive answer lies in the dense,\ninformation-rich landscape of corporate sustainability reports. However, the\nsheer volume and complexity of these reports make human analysis very costly.\nTherefore, only a few entities worldwide have the resources to analyze these\nreports at scale, which leads to a lack of transparency in sustainability\nreporting. Empowering stakeholders with LLM-based automatic analysis tools can\nbe a promising way to democratize sustainability report analysis. However,\ndeveloping such tools is challenging due to (1) the hallucination of LLMs and\n(2) the inefficiency of bringing domain experts into the AI development loop.\nIn this paper, we introduce ChatReport, a novel LLM-based system to automate the analysis\nof corporate sustainability reports, addressing existing challenges by (1)\nmaking the answers traceable to reduce the harm of hallucination and (2)\nactively involving domain experts in the development loop. 
We make our\nmethodology, annotated datasets, and generated analyses of 1015 reports\npublicly available.\n","authors":["Jingwei Ni","Julia Bingler","Chiara Colesanti-Senni","Mathias Kraus","Glen Gostlow","Tobias Schimanski","Dominik Stammbach","Saeid Ashraf Vaghefi","Qian Wang","Nicolas Webersinke","Tobias Wekhof","Tingyu Yu","Markus Leippold"],"pdf_url":"https://arxiv.org/pdf/2307.15770v2.pdf","comment":"6 pages. arXiv admin note: text overlap with arXiv:2306.15518"},{"id":"http://arxiv.org/abs/2310.07644v1","updated":"2023-10-11T16:40:57Z","published":"2023-10-11T16:40:57Z","title":"Rethinking the BERT-like Pretraining for DNA Sequences","summary":" With the success of large-scale pretraining in NLP, there is an increasing\ntrend of applying it to the domain of life sciences. In particular, pretraining\nmethods based on DNA sequences have garnered growing attention due to their\npotential to capture generic information about genes. However, existing\npretraining methods for DNA sequences largely rely on direct adoptions of BERT\npretraining from NLP, lacking a comprehensive understanding and a specifically\ntailored approach. To address this research gap, we first conducted a series of\nexploratory experiments and gained several insightful observations: 1) In the\nfine-tuning phase of downstream tasks, when using K-mer overlapping\ntokenization instead of K-mer non-overlapping tokenization, both overlapping\nand non-overlapping pretraining weights show consistent performance\nimprovement. 2) During the pre-training process, using K-mer overlapping\ntokenization quickly produces clear K-mer embeddings and reduces the loss to a\nvery low level, while using K-mer non-overlapping tokenization results in less\ndistinct embeddings and continuously decreases the loss. 3) Using overlapping\ntokenization causes the self-attention in the intermediate layers of\npre-trained models to tend to overly focus on certain tokens, reflecting that\nthese layers are not adequately optimized. In summary, overlapping tokenization\ncan benefit the fine-tuning of downstream tasks but leads to inadequate\npretraining with fast convergence. To unleash the pretraining potential, we\nintroduce a novel approach called RandomMask, which gradually increases the\ntask difficulty of BERT-like pretraining by continuously expanding its mask\nboundary, forcing the model to learn more knowledge. RandomMask is simple but\neffective, achieving top-tier performance on 26 of 28 datasets\nspanning 7 downstream tasks.\n","authors":["Chaoqi Liang","Weiqiang Bai","Lifeng Qiao","Yuchen Ren","Jianle Sun","Peng Ye","Hongliang Yan","Xinzhu Ma","Wangmeng Zuo","Wanli Ouyang"],"pdf_url":"https://arxiv.org/pdf/2310.07644v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07641v1","updated":"2023-10-11T16:38:11Z","published":"2023-10-11T16:38:11Z","title":"Evaluating Large Language Models at Evaluating Instruction Following","summary":" As research in large language models (LLMs) continues to accelerate,\nLLM-based evaluation has emerged as a scalable and cost-effective alternative\nto human evaluations for comparing the ever increasing list of models. This\npaper investigates the efficacy of these \"LLM evaluators\", particularly in\nusing them to assess instruction following, a metric that gauges how closely\ngenerated text adheres to the given instruction. We introduce a challenging\nmeta-evaluation benchmark, LLMBar, designed to test the ability of an LLM\nevaluator in discerning instruction-following outputs. 
The authors manually\ncurated 419 pairs of outputs, one adhering to instructions while the other\ndiverging, yet may possess deceptive qualities that mislead an LLM evaluator,\ne.g., a more engaging tone. Contrary to existing meta-evaluation, we discover\nthat different evaluators (i.e., combinations of LLMs and prompts) exhibit\ndistinct performance on LLMBar and even the highest-scoring ones have\nsubstantial room for improvement. We also present a novel suite of prompting\nstrategies that further close the gap between LLM and human evaluators. With\nLLMBar, we hope to offer more insight into LLM evaluators and foster future\nresearch in developing better instruction-following models.\n","authors":["Zhiyuan Zeng","Jiatong Yu","Tianyu Gao","Yu Meng","Tanya Goyal","Danqi Chen"],"pdf_url":"https://arxiv.org/pdf/2310.07641v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2306.13047v3","updated":"2023-10-11T16:33:39Z","published":"2023-06-22T17:13:08Z","title":"Analysis of the Cambridge Multiple-Choice Questions Reading Dataset with\n a Focus on Candidate Response Distribution","summary":" Multiple choice exams are widely used to assess candidates across a diverse\nrange of domains and tasks. To moderate question quality, newly proposed\nquestions often pass through pre-test evaluation stages before being deployed\ninto real-world exams. Currently, this evaluation process is manually\nintensive, which can lead to time lags in the question development cycle.\nStreamlining this process via automation can significantly enhance efficiency,\nhowever, there's a current lack of datasets with adequate pre-test analysis\ninformation. In this paper we analyse the Cambridge Multiple-Choice Questions\nReading Dataset; a multiple-choice comprehension dataset of questions at\ndifferent target levels, with corresponding candidate selection distributions.\nWe introduce the task of candidate distribution matching, propose several\nevaluation metrics for the task, and demonstrate that automatic systems trained\non RACE++ can be leveraged as baselines for our task. We further demonstrate\nthat these automatic systems can be used for practical pre-test evaluation\ntasks such as detecting underperforming distractors, where our detection\nsystems can automatically identify poor distractors that few candidates select.\n","authors":["Adian Liusie","Vatsal Raina","Andrew Mullooly","Kate Knill","Mark J. F. Gales"],"pdf_url":"https://arxiv.org/pdf/2306.13047v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07629v1","updated":"2023-10-11T16:18:13Z","published":"2023-10-11T16:18:13Z","title":"The Past, Present and Better Future of Feedback Learning in Large\n Language Models for Subjective Human Preferences and Values","summary":" Human feedback is increasingly used to steer the behaviours of Large Language\nModels (LLMs). However, it is unclear how to collect and incorporate feedback\nin a way that is efficient, effective and unbiased, especially for highly\nsubjective human preferences and values. In this paper, we survey existing\napproaches for learning from human feedback, drawing on 95 papers primarily\nfrom the ACL and arXiv repositories.First, we summarise the past, pre-LLM\ntrends for integrating human feedback into language models. Second, we give an\noverview of present techniques and practices, as well as the motivations for\nusing feedback; conceptual frameworks for defining values and preferences; and\nhow feedback is collected and from whom. 
Finally, we encourage a better future\nof feedback learning in LLMs by raising five unresolved conceptual and\npractical challenges.\n","authors":["Hannah Rose Kirk","Andrew M. Bean","Bertie Vidgen","Paul Röttger","Scott A. Hale"],"pdf_url":"https://arxiv.org/pdf/2310.07629v1.pdf","comment":"Accepted for the 2023 Conference on Empirical Methods in Natural\n Language Processing (EMNLP, Main)"},{"id":"http://arxiv.org/abs/2303.08268v3","updated":"2023-10-11T16:17:20Z","published":"2023-03-14T23:01:27Z","title":"Chat with the Environment: Interactive Multimodal Perception Using Large\n Language Models","summary":" Programming robot behavior in a complex world faces challenges on multiple\nlevels, from dextrous low-level skills to high-level planning and reasoning.\nRecent pre-trained Large Language Models (LLMs) have shown remarkable reasoning\nability in few-shot robotic planning. However, it remains challenging to ground\nLLMs in multimodal sensory input and continuous action output, while enabling a\nrobot to interact with its environment and acquire novel information as its\npolicies unfold. We develop a robot interaction scenario with a partially\nobservable state, which necessitates a robot to decide on a range of epistemic\nactions in order to sample sensory information among multiple modalities,\nbefore being able to execute the task correctly. Matcha (Multimodal environment\nchatting) agent, an interactive perception framework, is therefore proposed\nwith an LLM as its backbone, whose ability is exploited to instruct epistemic\nactions and to reason over the resulting multimodal sensations (vision, sound,\nhaptics, proprioception), as well as to plan an entire task execution based on\nthe interactively acquired information. Our study demonstrates that LLMs can\nprovide high-level planning and reasoning skills and control interactive robot\nbehavior in a multimodal environment, while multimodal modules with the context\nof the environmental state help ground the LLMs and extend their processing\nability. The project website can be found at https://matcha-agent.github.io.\n","authors":["Xufeng Zhao","Mengdi Li","Cornelius Weber","Muhammad Burhan Hafez","Stefan Wermter"],"pdf_url":"https://arxiv.org/pdf/2303.08268v3.pdf","comment":"IROS2023, Detroit. See the project website at\n https://matcha-agent.github.io"},{"id":"http://arxiv.org/abs/2310.07611v1","updated":"2023-10-11T15:56:00Z","published":"2023-10-11T15:56:00Z","title":"Democratizing LLMs: An Exploration of Cost-Performance Trade-offs in\n Self-Refined Open-Source Models","summary":" The dominance of proprietary LLMs has led to restricted access and raised\ninformation privacy concerns. High-performing open-source alternatives are\ncrucial for information-sensitive and high-volume applications but often lag\nbehind in performance. To address this gap, we propose (1) A untargeted variant\nof iterative self-critique and self-refinement devoid of external influence.\n(2) A novel ranking metric - Performance, Refinement, and Inference Cost Score\n(PeRFICS) - to find the optimal model for a given task considering refined\nperformance and cost. Our experiments show that SoTA open source models of\nvarying sizes from 7B - 65B, on average, improve 8.2% from their baseline\nperformance. 
Strikingly, even models with extremely small memory footprints,\nsuch as Vicuna-7B, show a 11.74% improvement overall and up to a 25.39%\nimprovement in high-creativity, open ended tasks on the Vicuna benchmark.\nVicuna-13B takes it a step further and outperforms ChatGPT post-refinement.\nThis work has profound implications for resource-constrained and\ninformation-sensitive environments seeking to leverage LLMs without incurring\nprohibitive costs, compromising on performance and privacy. The domain-agnostic\nself-refinement process coupled with our novel ranking metric facilitates\ninformed decision-making in model selection, thereby reducing costs and\ndemocratizing access to high-performing language models, as evidenced by case\nstudies.\n","authors":["Sumuk Shashidhar","Abhinav Chinta","Vaibhav Sahai","Zhenhailong Wang","Heng Ji"],"pdf_url":"https://arxiv.org/pdf/2310.07611v1.pdf","comment":"Initial Preprint. Accepted to Findings of EMNLP 2023"},{"id":"http://arxiv.org/abs/2310.07609v1","updated":"2023-10-11T15:51:53Z","published":"2023-10-11T15:51:53Z","title":"QACHECK: A Demonstration System for Question-Guided Multi-Hop\n Fact-Checking","summary":" Fact-checking real-world claims often requires complex, multi-step reasoning\ndue to the absence of direct evidence to support or refute them. However,\nexisting fact-checking systems often lack transparency in their\ndecision-making, making it challenging for users to comprehend their reasoning\nprocess. To address this, we propose the Question-guided Multi-hop\nFact-Checking (QACHECK) system, which guides the model's reasoning process by\nasking a series of questions critical for verifying a claim. QACHECK has five\nkey modules: a claim verifier, a question generator, a question-answering\nmodule, a QA validator, and a reasoner. Users can input a claim into QACHECK,\nwhich then predicts its veracity and provides a comprehensive report detailing\nits reasoning process, guided by a sequence of (question, answer) pairs.\nQACHECK also provides the source of evidence supporting each question,\nfostering a transparent, explainable, and user-friendly fact-checking process.\nA recorded video of QACHECK is at https://www.youtube.com/watch?v=ju8kxSldM64\n","authors":["Liangming Pan","Xinyuan Lu","Min-Yen Kan","Preslav Nakov"],"pdf_url":"https://arxiv.org/pdf/2310.07609v1.pdf","comment":"Accepted at EMNLP 2023 System Demonstrations Track"},{"id":"http://arxiv.org/abs/2310.07588v1","updated":"2023-10-11T15:28:44Z","published":"2023-10-11T15:28:44Z","title":"Accurate Use of Label Dependency in Multi-Label Text Classification\n Through the Lens of Causality","summary":" Multi-Label Text Classification (MLTC) aims to assign the most relevant\nlabels to each given text. Existing methods demonstrate that label dependency\ncan help to improve the model's performance. However, the introduction of label\ndependency may cause the model to suffer from unwanted prediction bias. In this\nstudy, we attribute the bias to the model's misuse of label dependency, i.e.,\nthe model tends to utilize the correlation shortcut in label dependency rather\nthan fusing text information and label dependency for prediction. 
Motivated by\ncausal inference, we propose a CounterFactual Text Classifier (CFTC) to\neliminate the correlation bias, and make causality-based predictions.\nSpecifically, our CFTC first adopts the predict-then-modify backbone to extract\nprecise label information embedded in label dependency, then blocks the\ncorrelation shortcut through the counterfactual de-bias technique with the help\nof the human causal graph. Experimental results on three datasets demonstrate\nthat our CFTC significantly outperforms the baselines and effectively\neliminates the correlation bias in datasets.\n","authors":["Caoyun Fan","Wenqing Chen","Jidong Tian","Yitian Li","Hao He","Yaohui Jin"],"pdf_url":"https://arxiv.org/pdf/2310.07588v1.pdf","comment":"Applied Intelligence 2023"},{"id":"http://arxiv.org/abs/2304.14933v2","updated":"2023-10-11T15:08:51Z","published":"2023-04-28T15:43:21Z","title":"An Empirical Study of Multimodal Model Merging","summary":" Model merging (e.g., via interpolation or task arithmetic) fuses multiple\nmodels trained on different tasks to generate a multi-task solution. The\ntechnique has been proven successful in previous studies, where the models are\ntrained on similar tasks and with the same initialization. In this paper, we\nexpand on this concept to a multimodal setup by merging transformers trained on\ndifferent modalities. Furthermore, we conduct our study for a novel goal where\nwe can merge vision, language, and cross-modal transformers of a\nmodality-specific architecture to create a parameter-efficient\nmodality-agnostic architecture. Through comprehensive experiments, we\nsystematically investigate the key factors impacting model performance after\nmerging, including initialization, merging mechanisms, and model architectures.\nWe also propose two metrics that assess the distance between weights to be\nmerged and can serve as an indicator of the merging outcomes. Our analysis\nleads to an effective training recipe for matching the performance of the\nmodality-agnostic baseline (i.e., pre-trained from scratch) via model merging.\nOur method also outperforms naive merging significantly on various tasks, with\nimprovements of 3% on VQA, 7% on COCO retrieval, 25% on NLVR2, 14% on Flickr30k\nand 3% on ADE20k. Our code is available at https://github.com/ylsung/vl-merging\n","authors":["Yi-Lin Sung","Linjie Li","Kevin Lin","Zhe Gan","Mohit Bansal","Lijuan Wang"],"pdf_url":"https://arxiv.org/pdf/2304.14933v2.pdf","comment":"EMNLP 2023 Findings"},{"id":"http://arxiv.org/abs/2308.12067v2","updated":"2023-10-11T14:49:26Z","published":"2023-08-23T11:27:30Z","title":"InstructionGPT-4: A 200-Instruction Paradigm for Fine-Tuning MiniGPT-4","summary":" Multimodal large language models are typically trained in two stages: first\npre-training on image-text pairs, and then fine-tuning using supervised\nvision-language instruction data. Recent studies have shown that large language\nmodels can achieve satisfactory results even with a limited amount of\nhigh-quality instruction-following data. In this paper, we introduce\nInstructionGPT-4, which is fine-tuned on a small dataset comprising only 200\nexamples, amounting to approximately 6\\% of the instruction-following data used\nin the alignment dataset for MiniGPT-4. To achieve this, we first propose\nseveral metrics to access the quality of multimodal instruction data. Based on\nthese metrics, we present an effective and trainable data selector to\nautomatically identify and filter low-quality vision-language data. 
By\nemploying this method, InstructionGPT-4 outperforms the original MiniGPT-4 on\nvarious evaluations. Overall, our findings demonstrate that less but\nhigh-quality instruction tuning data is efficient in enabling multimodal large\nlanguage models to generate better output. Our code is available at\nhttps://github.com/waltonfuture/InstructionGPT-4.\n","authors":["Lai Wei","Zihao Jiang","Weiran Huang","Lichao Sun"],"pdf_url":"https://arxiv.org/pdf/2308.12067v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07521v1","updated":"2023-10-11T14:18:03Z","published":"2023-10-11T14:18:03Z","title":"Survey on Factuality in Large Language Models: Knowledge, Retrieval and\n Domain-Specificity","summary":" This survey addresses the crucial issue of factuality in Large Language\nModels (LLMs). As LLMs find applications across diverse domains, the\nreliability and accuracy of their outputs become vital. We define the\nFactuality Issue as the probability of LLMs to produce content inconsistent\nwith established facts. We first delve into the implications of these\ninaccuracies, highlighting the potential consequences and challenges posed by\nfactual errors in LLM outputs. Subsequently, we analyze the mechanisms through\nwhich LLMs store and process facts, seeking the primary causes of factual\nerrors. Our discussion then transitions to methodologies for evaluating LLM\nfactuality, emphasizing key metrics, benchmarks, and studies. We further\nexplore strategies for enhancing LLM factuality, including approaches tailored\nfor specific domains. We focus on two primary LLM configurations, standalone LLMs\nand Retrieval-Augmented LLMs that utilize external data, and detail their\nunique challenges and potential enhancements. Our survey offers a structured\nguide for researchers aiming to fortify the factual reliability of LLMs.\n","authors":["Cunxiang Wang","Xiaoze Liu","Yuanhao Yue","Xiangru Tang","Tianhang Zhang","Cheng Jiayang","Yunzhi Yao","Wenyang Gao","Xuming Hu","Zehan Qi","Yidong Wang","Linyi Yang","Jindong Wang","Xing Xie","Zheng Zhang","Yue Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.07521v1.pdf","comment":"43 pages; 300+ references"},{"id":"http://arxiv.org/abs/2310.07488v1","updated":"2023-10-11T13:35:05Z","published":"2023-10-11T13:35:05Z","title":"KwaiYiiMath: Technical Report","summary":" Recent advancements in large language models (LLMs) have demonstrated\nremarkable abilities in handling a variety of natural language processing (NLP)\ndownstream tasks, even on mathematical tasks requiring multi-step reasoning. In\nthis report, we introduce KwaiYiiMath, which enhances the mathematical\nreasoning abilities of KwaiYiiBase1, by applying Supervised Fine-Tuning (SFT)\nand Reinforcement Learning from Human Feedback (RLHF), including on both English\nand Chinese mathematical tasks. Meanwhile, we also constructed a small-scale\nChinese primary school mathematics test set (named KMath), consisting of 188\nexamples to evaluate the correctness of the problem-solving process generated\nby the models. 
Empirical studies demonstrate that KwaiYiiMath can achieve\nstate-of-the-art (SOTA) performance on GSM8k, CMath, and KMath compared with\nthe similar size models, respectively.\n","authors":["Jiayi Fu","Lei Lin","Xiaoyang Gao","Pengli Liu","Zhengzong Chen","Zhirui Yang","Shengnan Zhang","Xue Zheng","Yan Li","Yuliang Liu","Xucheng Ye","Yiqiao Liao","Chao Liao","Bin Chen","Chengru Song","Junchen Wan","Zijia Lin","Fuzheng Zhang","Zhongyuan Wang","Di Zhang","Kun Gai"],"pdf_url":"https://arxiv.org/pdf/2310.07488v1.pdf","comment":"technical report"},{"id":"http://arxiv.org/abs/2310.07487v1","updated":"2023-10-11T13:34:22Z","published":"2023-10-11T13:34:22Z","title":"Cognate Transformer for Automated Phonological Reconstruction and\n Cognate Reflex Prediction","summary":" Phonological reconstruction is one of the central problems in historical\nlinguistics where a proto-word of an ancestral language is determined from the\nobserved cognate words of daughter languages. Computational approaches to\nhistorical linguistics attempt to automate the task by learning models on\navailable linguistic data. Several ideas and techniques drawn from\ncomputational biology have been successfully applied in the area of\ncomputational historical linguistics. Following these lines, we adapt MSA\nTransformer, a protein language model, to the problem of automated phonological\nreconstruction. MSA Transformer trains on multiple sequence alignments as input\nand is, thus, apt for application on aligned cognate words. We, hence, name our\nmodel as Cognate Transformer. We also apply the model on another associated\ntask, namely, cognate reflex prediction, where a reflex word in a daughter\nlanguage is predicted based on cognate words from other daughter languages. We\nshow that our model outperforms the existing models on both tasks, especially\nwhen it is pre-trained on masked word prediction task.\n","authors":["V. S. D. S. Mahesh Akavarapu","Arnab Bhattacharya"],"pdf_url":"https://arxiv.org/pdf/2310.07487v1.pdf","comment":"Accepted to appear at the conference of EMNLP-2023"},{"id":"http://arxiv.org/abs/2310.07423v1","updated":"2023-10-11T12:15:24Z","published":"2023-10-11T12:15:24Z","title":"Adapting the adapters for code-switching in multilingual ASR","summary":" Recently, large pre-trained multilingual speech models have shown potential\nin scaling Automatic Speech Recognition (ASR) to many low-resource languages.\nSome of these models employ language adapters in their formulation, which helps\nto improve monolingual performance and avoids some of the drawbacks of\nmulti-lingual modeling on resource-rich languages. However, this formulation\nrestricts the usability of these models on code-switched speech, where two\nlanguages are mixed together in the same utterance. In this work, we propose\nways to effectively fine-tune such models on code-switched speech, by\nassimilating information from both language adapters at each language\nadaptation point in the network. We also model code-switching as a sequence of\nlatent binary sequences that can be used to guide the flow of information from\neach language adapter at the frame level. 
The proposed approaches are evaluated\non three code-switched datasets encompassing Arabic, Mandarin, and Hindi\nlanguages paired with English, showing consistent improvements in\ncode-switching performance with at least 10\\% absolute reduction in CER across\nall test sets.\n","authors":["Atharva Kulkarni","Ajinkya Kulkarni","Miguel Couceiro","Hanan Aldarmaki"],"pdf_url":"https://arxiv.org/pdf/2310.07423v1.pdf","comment":"Submitted to ICASSP 2024"},{"id":"http://arxiv.org/abs/2310.07403v1","updated":"2023-10-11T11:39:36Z","published":"2023-10-11T11:39:36Z","title":"DASpeech: Directed Acyclic Transformer for Fast and High-quality\n Speech-to-Speech Translation","summary":" Direct speech-to-speech translation (S2ST) translates speech from one\nlanguage into another using a single model. However, due to the presence of\nlinguistic and acoustic diversity, the target speech follows a complex\nmultimodal distribution, posing challenges to achieving both high-quality\ntranslations and fast decoding speeds for S2ST models. In this paper, we\npropose DASpeech, a non-autoregressive direct S2ST model which realizes both\nfast and high-quality S2ST. To better capture the complex distribution of the\ntarget speech, DASpeech adopts the two-pass architecture to decompose the\ngeneration process into two steps, where a linguistic decoder first generates\nthe target text, and an acoustic decoder then generates the target speech based\non the hidden states of the linguistic decoder. Specifically, we use the\ndecoder of DA-Transformer as the linguistic decoder, and use FastSpeech 2 as\nthe acoustic decoder. DA-Transformer models translations with a directed\nacyclic graph (DAG). To consider all potential paths in the DAG during\ntraining, we calculate the expected hidden states for each target token via\ndynamic programming, and feed them into the acoustic decoder to predict the\ntarget mel-spectrogram. During inference, we select the most probable path and\ntake hidden states on that path as input to the acoustic decoder. Experiments\non the CVSS Fr-En benchmark demonstrate that DASpeech can achieve comparable or\neven better performance than the state-of-the-art S2ST model Translatotron 2,\nwhile preserving up to 18.53x speedup compared to the autoregressive baseline.\nCompared with the previous non-autoregressive S2ST model, DASpeech does not\nrely on knowledge distillation and iterative decoding, achieving significant\nimprovements in both translation quality and decoding speed. Furthermore,\nDASpeech shows the ability to preserve the speaker's voice of the source speech\nduring translation.\n","authors":["Qingkai Fang","Yan Zhou","Yang Feng"],"pdf_url":"https://arxiv.org/pdf/2310.07403v1.pdf","comment":"NeurIPS 2023. Audio samples are available at\n https://ictnlp.github.io/daspeech-demo/"},{"id":"http://arxiv.org/abs/2310.07397v1","updated":"2023-10-11T11:32:57Z","published":"2023-10-11T11:32:57Z","title":"Target-oriented Proactive Dialogue Systems with Personalization: Problem\n Formulation and Dataset Curation","summary":" Target-oriented dialogue systems, designed to proactively steer conversations\ntoward predefined targets or accomplish specific system-side goals, are an\nexciting area in conversational AI. In this work, by formulating a pair as the conversation target, we explore a novel problem of\npersonalized target-oriented dialogue by considering personalization during the\ntarget accomplishment process. 
However, there remains an emergent need for\nhigh-quality datasets, and building one from scratch requires tremendous human\neffort. To address this, we propose an automatic dataset curation framework\nusing a role-playing approach. Based on this framework, we construct a\nlarge-scale personalized target-oriented dialogue dataset, TopDial, which\ncomprises about 18K multi-turn dialogues. The experimental results show that\nthis dataset is of high quality and could contribute to exploring personalized\ntarget-oriented dialogue.\n","authors":["Jian Wang","Yi Cheng","Dongding Lin","Chak Tou Leong","Wenjie Li"],"pdf_url":"https://arxiv.org/pdf/2310.07397v1.pdf","comment":"Accepted to EMNLP-2023 main conference"},{"id":"http://arxiv.org/abs/2310.07387v1","updated":"2023-10-11T11:08:20Z","published":"2023-10-11T11:08:20Z","title":"Linguistic laws in biology","summary":" Linguistic laws, the common statistical patterns of human language, have been\ninvestigated by quantitative linguists for nearly a century. Recently,\nbiologists from a range of disciplines have started to explore the prevalence\nof these laws beyond language, finding patterns consistent with linguistic laws\nacross multiple levels of biological organisation, from molecular (genomes,\ngenes, and proteins) to organismal (animal behaviour) to ecological\n(populations and ecosystems). We propose a new conceptual framework for the\nstudy of linguistic laws in biology, comprising and integrating distinct levels\nof analysis, from description to prediction to theory building. Adopting this\nframework will provide critical new insights into the fundamental rules of\norganisation underpinning natural systems, unifying linguistic laws and core\ntheory in biology.\n","authors":["Stuart Semple","Ramon Ferrer-i-Cancho","Morgan L. Gustison"],"pdf_url":"https://arxiv.org/pdf/2310.07387v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.08732v3","updated":"2023-10-11T10:51:12Z","published":"2023-05-15T15:47:09Z","title":"Knowledge Rumination for Pre-trained Language Models","summary":" Previous studies have revealed that vanilla pre-trained language models\n(PLMs) lack the capacity to handle knowledge-intensive NLP tasks alone; thus,\nseveral works have attempted to integrate external knowledge into PLMs.\nHowever, despite the promising outcome, we empirically observe that PLMs may\nhave already encoded rich knowledge in their pre-trained parameters but fail to\nfully utilize them when applying them to knowledge-intensive tasks. In this\npaper, we propose a new paradigm dubbed Knowledge Rumination to help the\npre-trained language model utilize that related latent knowledge without\nretrieving it from the external corpus. By simply adding a prompt like \"As far\nas I know\" to the PLMs, we try to review related latent knowledge and inject\nthem back into the model for knowledge consolidation. We apply the proposed\nknowledge rumination to various language models, including RoBERTa, DeBERTa,\nand GPT-3. Experimental results on six commonsense reasoning tasks and GLUE\nbenchmarks demonstrate the effectiveness of our proposed approach, which proves\nthat the knowledge stored in PLMs can be better exploited to enhance\nperformance. 
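The Knowledge Rumination abstract above describes using a prompt such as "As far as I know" to review latent knowledge and then consolidate it for the task. A minimal sketch of that two-stage flow follows, assuming a hypothetical `call_model` interface; the actual method injects the reviewed knowledge back into the model rather than relying on plain text prompting alone.

```python
# Illustrative two-stage flow for Knowledge Rumination prompting.
# `call_model` is a hypothetical placeholder, not a real API.

def call_model(prompt: str) -> str:
    """Stand-in for a language-model call (assumption for illustration)."""
    return "(model output for: " + prompt[:40] + "...)"

def ruminate_then_answer(question: str) -> str:
    # Stage 1: ask the model to "review" its own latent knowledge.
    knowledge = call_model(f"As far as I know, {question}")
    # Stage 2: answer the original question with the elicited knowledge injected.
    return call_model(f"Background: {knowledge}\nQuestion: {question}\nAnswer:")

if __name__ == "__main__":
    print(ruminate_then_answer("why do people wear sunglasses in bright sunlight?"))
```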
Code is available in\nhttps://github.com/zjunlp/knowledge-rumination.\n","authors":["Yunzhi Yao","Peng Wang","Shengyu Mao","Chuanqi Tan","Fei Huang","Huajun Chen","Ningyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2305.08732v3.pdf","comment":"EMNLP 2023"},{"id":"http://arxiv.org/abs/2310.06692v2","updated":"2023-10-11T10:05:29Z","published":"2023-10-10T15:10:03Z","title":"Meta-CoT: Generalizable Chain-of-Thought Prompting in Mixed-task\n Scenarios with Large Language Models","summary":" Large language models (LLMs) have unveiled remarkable reasoning capabilities\nby exploiting chain-of-thought (CoT) prompting, which generates intermediate\nreasoning chains to serve as the rationale for deriving the answer. However,\ncurrent CoT methods either simply employ general prompts such as Let's think\nstep by step, or heavily rely on handcrafted task-specific demonstrations to\nattain preferable performances, thereby engendering an inescapable gap between\nperformance and generalization. To bridge this gap, we propose Meta-CoT, a\ngeneralizable CoT prompting method in mixed-task scenarios where the type of\ninput questions is unknown. Meta-CoT firstly categorizes the scenario based on\nthe input question and subsequently constructs diverse demonstrations from the\ncorresponding data pool in an automatic pattern. Meta-CoT simultaneously enjoys\nremarkable performances on ten public benchmark reasoning tasks and superior\ngeneralization capabilities. Notably, Meta-CoT achieves the state-of-the-art\nresult on SVAMP (93.7%) without any additional program-aided methods. Our\nfurther experiments on five out-of-distribution datasets verify the stability\nand generality of Meta-CoT.\n","authors":["Anni Zou","Zhuosheng Zhang","Hai Zhao","Xiangru Tang"],"pdf_url":"https://arxiv.org/pdf/2310.06692v2.pdf","comment":"17 pages, 7 figures"},{"id":"http://arxiv.org/abs/2310.07347v1","updated":"2023-10-11T09:55:46Z","published":"2023-10-11T09:55:46Z","title":"Fast-ELECTRA for Efficient Pre-training","summary":" ELECTRA pre-trains language models by detecting tokens in a sequence that\nhave been replaced by an auxiliary model. Although ELECTRA offers a significant\nboost in efficiency, its potential is constrained by the training cost brought\nby the auxiliary model. Notably, this model, which is jointly trained with the\nmain model, only serves to assist the training of the main model and is\ndiscarded post-training. This results in a substantial amount of training cost\nbeing expended in vain. To mitigate this issue, we propose Fast-ELECTRA, which\nleverages an existing language model as the auxiliary model. To construct a\nlearning curriculum for the main model, we smooth its output distribution via\ntemperature scaling following a descending schedule. Our approach rivals the\nperformance of state-of-the-art ELECTRA-style pre-training methods, while\nsignificantly eliminating the computation and memory cost brought by the joint\ntraining of the auxiliary model. 
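Fast-ELECTRA (above) smooths the frozen auxiliary model's output distribution via temperature scaling that follows a descending schedule. The sketch below shows a temperature-scaled softmax with an assumed linear decay; the start and end temperatures and the schedule shape are illustrative assumptions, not values from the paper.

```python
# Sketch: temperature-scaled softmax plus a descending temperature schedule,
# as used conceptually in Fast-ELECTRA to build a curriculum for the main model.

import math
from typing import List

def softmax_with_temperature(logits: List[float], temperature: float) -> List[float]:
    """Temperature-scaled softmax; a higher temperature gives a smoother distribution."""
    scaled = [l / temperature for l in logits]
    m = max(scaled)
    exps = [math.exp(s - m) for s in scaled]
    z = sum(exps)
    return [e / z for e in exps]

def descending_temperature(step: int, total_steps: int,
                           t_start: float = 2.0, t_end: float = 1.0) -> float:
    """Assumed linear decay from t_start to t_end over training (illustrative only)."""
    frac = min(max(step / max(total_steps, 1), 0.0), 1.0)
    return t_start + (t_end - t_start) * frac

if __name__ == "__main__":
    logits = [2.0, 0.5, -1.0]
    for step in (0, 5000, 10000):
        t = descending_temperature(step, total_steps=10000)
        print(step, round(t, 2), softmax_with_temperature(logits, t))
```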
Our method also reduces the sensitivity to\nhyper-parameters and enhances the pre-training stability.\n","authors":["Chengyu Dong","Liyuan Liu","Hao Cheng","Jingbo Shang","Jianfeng Gao","Xiaodong Liu"],"pdf_url":"https://arxiv.org/pdf/2310.07347v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07345v1","updated":"2023-10-11T09:53:17Z","published":"2023-10-11T09:53:17Z","title":"Investigating the Effect of Language Models in Sequence Discriminative\n Training for Neural Transducers","summary":" In this work, we investigate the effect of language models (LMs) with\ndifferent context lengths and label units (phoneme vs. word) used in sequence\ndiscriminative training for phoneme-based neural transducers. Both lattice-free\nand N-best-list approaches are examined. For lattice-free methods with\nphoneme-level LMs, we propose a method to approximate the context history to\nemploy LMs with full-context dependency. This approximation can be extended to\narbitrary context length and enables the usage of word-level LMs in\nlattice-free methods. Moreover, a systematic comparison is conducted across\nlattice-free and N-best-list-based methods. Experimental results on Librispeech\nshow that using the word-level LM in training outperforms the phoneme-level LM.\nBesides, we find that the context size of the LM used for probability\ncomputation has a limited effect on performance. Moreover, our results reveal\nthe pivotal importance of the hypothesis space quality in sequence\ndiscriminative training.\n","authors":["Zijian Yang","Wei Zhou","Ralf Schlüter","Hermann Ney"],"pdf_url":"https://arxiv.org/pdf/2310.07345v1.pdf","comment":"accepted at ASRU 2023"},{"id":"http://arxiv.org/abs/2304.14767v2","updated":"2023-10-11T09:49:59Z","published":"2023-04-28T11:26:17Z","title":"Dissecting Recall of Factual Associations in Auto-Regressive Language\n Models","summary":" Transformer-based language models (LMs) are known to capture factual\nknowledge in their parameters. While previous work looked into where factual\nassociations are stored, only little is known about how they are retrieved\ninternally during inference. We investigate this question through the lens of\ninformation flow. Given a subject-relation query, we study how the model\naggregates information about the subject and relation to predict the correct\nattribute. With interventions on attention edges, we first identify two\ncritical points where information propagates to the prediction: one from the\nrelation positions followed by another from the subject positions. Next, by\nanalyzing the information at these points, we unveil a three-step internal\nmechanism for attribute extraction. First, the representation at the\nlast-subject position goes through an enrichment process, driven by the early\nMLP sublayers, to encode many subject-related attributes. Second, information\nfrom the relation propagates to the prediction. Third, the prediction\nrepresentation \"queries\" the enriched subject to extract the attribute. Perhaps\nsurprisingly, this extraction is typically done via attention heads, which\noften encode subject-attribute mappings in their parameters. 
Overall, our\nfindings introduce a comprehensive view of how factual associations are stored\nand extracted internally in LMs, facilitating future research on knowledge\nlocalization and editing.\n","authors":["Mor Geva","Jasmijn Bastings","Katja Filippova","Amir Globerson"],"pdf_url":"https://arxiv.org/pdf/2304.14767v2.pdf","comment":"Accepted at EMNLP 2023"},{"id":"http://arxiv.org/abs/2310.07343v1","updated":"2023-10-11T09:46:32Z","published":"2023-10-11T09:46:32Z","title":"How Do Large Language Models Capture the Ever-changing World Knowledge?\n A Review of Recent Advances","summary":" Although large language models (LLMs) are impressive in solving various\ntasks, they can quickly be outdated after deployment. Maintaining their\nup-to-date status is a pressing concern in the current era. This paper provides\na comprehensive review of recent advances in aligning LLMs with the\never-changing world knowledge without re-training from scratch. We categorize\nresearch works systemically and provide in-depth comparisons and discussion. We\nalso discuss existing challenges and highlight future directions to facilitate\nresearch in this field. We release the paper list at\nhttps://github.com/hyintell/awesome-refreshing-llms\n","authors":["Zihan Zhang","Meng Fang","Ling Chen","Mohammad-Reza Namazi-Rad","Jun Wang"],"pdf_url":"https://arxiv.org/pdf/2310.07343v1.pdf","comment":"EMNLP 2023 main conference, paper link at\n https://github.com/hyintell/awesome-refreshing-llms"},{"id":"http://arxiv.org/abs/2304.11082v4","updated":"2023-10-11T09:45:15Z","published":"2023-04-19T17:50:09Z","title":"Fundamental Limitations of Alignment in Large Language Models","summary":" An important aspect in developing language models that interact with humans\nis aligning their behavior to be useful and unharmful for their human users.\nThis is usually achieved by tuning the model in a way that enhances desired\nbehaviors and inhibits undesired ones, a process referred to as alignment. In\nthis paper, we propose a theoretical approach called Behavior Expectation\nBounds (BEB) which allows us to formally investigate several inherent\ncharacteristics and limitations of alignment in large language models.\nImportantly, we prove that within the limits of this framework, for any\nbehavior that has a finite probability of being exhibited by the model, there\nexist prompts that can trigger the model into outputting this behavior, with\nprobability that increases with the length of the prompt. This implies that any\nalignment process that attenuates an undesired behavior but does not remove it\naltogether, is not safe against adversarial prompting attacks. Furthermore, our\nframework hints at the mechanism by which leading alignment approaches such as\nreinforcement learning from human feedback make the LLM prone to being prompted\ninto the undesired behaviors. This theoretical result is being experimentally\ndemonstrated in large scale by the so called contemporary \"chatGPT jailbreaks\",\nwhere adversarial users trick the LLM into breaking its alignment guardrails by\ntriggering it into acting as a malicious persona. 
Our results expose\nfundamental limitations in alignment of LLMs and bring to the forefront the\nneed to devise reliable mechanisms for ensuring AI safety.\n","authors":["Yotam Wolf","Noam Wies","Oshri Avnery","Yoav Levine","Amnon Shashua"],"pdf_url":"https://arxiv.org/pdf/2304.11082v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07328v1","updated":"2023-10-11T09:18:09Z","published":"2023-10-11T09:18:09Z","title":"An Empirical Study of Instruction-tuning Large Language Models in\n Chinese","summary":" The success of ChatGPT validates the potential of large language models\n(LLMs) in artificial general intelligence (AGI). Subsequently, the release of\nLLMs has sparked the open-source community's interest in instruction-tuning,\nwhich is deemed to accelerate ChatGPT's replication process. However, research\non instruction-tuning LLMs in Chinese, the world's most spoken language, is\nstill in its early stages. Therefore, this paper makes an in-depth empirical\nstudy of instruction-tuning LLMs in Chinese, which can serve as a cookbook that\nprovides valuable findings for effectively customizing LLMs that can better\nrespond to Chinese instructions. Specifically, we systematically explore the\nimpact of LLM bases, parameter-efficient methods, instruction data types, which\nare the three most important elements for instruction-tuning. Besides, we also\nconduct experiment to study the impact of other factors, e.g., chain-of-thought\ndata and human-value alignment. We hope that this empirical study can make a\nmodest contribution to the open Chinese version of ChatGPT. This paper will\nrelease a powerful Chinese LLMs that is comparable to ChatGLM. The code and\ndata are available at https://github.com/PhoebusSi/Alpaca-CoT.\n","authors":["Qingyi Si","Tong Wang","Zheng Lin","Xu Zhang","Yanan Cao","Weiping Wang"],"pdf_url":"https://arxiv.org/pdf/2310.07328v1.pdf","comment":"EMNLP 2023"},{"id":"http://arxiv.org/abs/2310.07321v1","updated":"2023-10-11T09:09:55Z","published":"2023-10-11T09:09:55Z","title":"On the Impact of Cross-Domain Data on German Language Models","summary":" Traditionally, large language models have been either trained on general web\ncrawls or domain-specific data. However, recent successes of generative large\nlanguage models, have shed light on the benefits of cross-domain datasets. To\nexamine the significance of prioritizing data diversity over quality, we\npresent a German dataset comprising texts from five domains, along with another\ndataset aimed at containing high-quality data. Through training a series of\nmodels ranging between 122M and 750M parameters on both datasets, we conduct a\ncomprehensive benchmark on multiple downstream tasks. Our findings demonstrate\nthat the models trained on the cross-domain dataset outperform those trained on\nquality data alone, leading to improvements up to $4.45\\%$ over the previous\nstate-of-the-art. The models are available at\nhttps://huggingface.co/ikim-uk-essen\n","authors":["Amin Dada","Aokun Chen","Cheng Peng","Kaleb E Smith","Ahmad Idrissi-Yaghir","Constantin Marc Seibold","Jianning Li","Lars Heiliger","Christoph M. 
Friedrich","Daniel Truhn","Jan Egger","Jiang Bian","Jens Kleesiek","Yonghui Wu"],"pdf_url":"https://arxiv.org/pdf/2310.07321v1.pdf","comment":"13 pages, 1 figure, accepted at Findings of the Association for\n Computational Linguistics: EMNLP 2023"},{"id":"http://arxiv.org/abs/2310.07306v1","updated":"2023-10-11T08:40:06Z","published":"2023-10-11T08:40:06Z","title":"SNOiC: Soft Labeling and Noisy Mixup based Open Intent Classification\n Model","summary":" This paper presents a Soft Labeling and Noisy Mixup-based open intent\nclassification model (SNOiC). Most of the previous works have used\nthreshold-based methods to identify open intents, which are prone to\noverfitting and may produce biased predictions. Additionally, the need for more\navailable data for an open intent class presents another limitation for these\nexisting models. SNOiC combines Soft Labeling and Noisy Mixup strategies to\nreduce the biasing and generate pseudo-data for open intent class. The\nexperimental results on four benchmark datasets show that the SNOiC model\nachieves a minimum and maximum performance of 68.72\\% and 94.71\\%,\nrespectively, in identifying open intents. Moreover, compared to\nstate-of-the-art models, the SNOiC model improves the performance of\nidentifying open intents by 0.93\\% (minimum) and 12.76\\% (maximum). The model's\nefficacy is further established by analyzing various parameters used in the\nproposed model. An ablation study is also conducted, which involves creating\nthree model variants to validate the effectiveness of the SNOiC model.\n","authors":["Aditi Kanwar","Aditi Seetha","Satyendra Singh Chouhan","Rajdeep Niyogi"],"pdf_url":"https://arxiv.org/pdf/2310.07306v1.pdf","comment":"9 Pages, 6 figures"},{"id":"http://arxiv.org/abs/2310.07301v1","updated":"2023-10-11T08:36:43Z","published":"2023-10-11T08:36:43Z","title":"Parrot: Enhancing Multi-Turn Chat Models by Learning to Ask Questions","summary":" Impressive progress has been made on chat models based on Large Language\nModels (LLMs) recently; however, there is a noticeable lag in multi-turn\nconversations between open-source chat models (e.g., Alpaca and Vicuna) and the\nleading chat models (e.g., ChatGPT and GPT-4). Through a series of analyses, we\nattribute the lag to the lack of enough high-quality multi-turn\ninstruction-tuning data. The available instruction-tuning data for the\ncommunity are either single-turn conversations or multi-turn ones with certain\nissues, such as non-human-like instructions, less detailed responses, or rare\ntopic shifts. In this paper, we address these challenges by introducing Parrot,\na highly scalable solution designed to automatically generate high-quality\ninstruction-tuning data, which are then used to enhance the effectiveness of\nchat models in multi-turn conversations. Specifically, we start by training the\nParrot-Ask model, which is designed to emulate real users in generating\ninstructions. We then utilize Parrot-Ask to engage in multi-turn conversations\nwith ChatGPT across a diverse range of topics, resulting in a collection of 40K\nhigh-quality multi-turn dialogues (Parrot-40K). These data are subsequently\nemployed to train a chat model that we have named Parrot-Chat. We demonstrate\nthat the dialogues gathered from Parrot-Ask markedly outperform existing\nmulti-turn instruction-following datasets in critical metrics, including topic\ndiversity, number of turns, and resemblance to human conversation. 
With only\n40K training examples, Parrot-Chat achieves strong performance against other\n13B open-source models across a range of instruction-following benchmarks, and\nparticularly excels in evaluations of multi-turn capabilities. We make all\ncodes, datasets, and two versions of the Parrot-Ask model based on LLaMA2-13B\nand KuaiYii-13B available at https://github.com/kwai/KwaiYii/Parrot.\n","authors":["Yuchong Sun","Che Liu","Jinwen Huang","Ruihua Song","Fuzheng Zhang","Di Zhang","Zhongyuan Wang","Kun Gai"],"pdf_url":"https://arxiv.org/pdf/2310.07301v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.07678v2","updated":"2023-10-11T08:34:42Z","published":"2023-03-14T07:27:30Z","title":"Query2doc: Query Expansion with Large Language Models","summary":" This paper introduces a simple yet effective query expansion approach,\ndenoted as query2doc, to improve both sparse and dense retrieval systems. The\nproposed method first generates pseudo-documents by few-shot prompting large\nlanguage models (LLMs), and then expands the query with generated\npseudo-documents. LLMs are trained on web-scale text corpora and are adept at\nknowledge memorization. The pseudo-documents from LLMs often contain highly\nrelevant information that can aid in query disambiguation and guide the\nretrievers. Experimental results demonstrate that query2doc boosts the\nperformance of BM25 by 3% to 15% on ad-hoc IR datasets, such as MS-MARCO and\nTREC DL, without any model fine-tuning. Furthermore, our method also benefits\nstate-of-the-art dense retrievers in terms of both in-domain and out-of-domain\nresults.\n","authors":["Liang Wang","Nan Yang","Furu Wei"],"pdf_url":"https://arxiv.org/pdf/2303.07678v2.pdf","comment":"Accepted to EMNLP 2023"},{"id":"http://arxiv.org/abs/2310.07299v1","updated":"2023-10-11T08:33:23Z","published":"2023-10-11T08:33:23Z","title":"RobustGEC: Robust Grammatical Error Correction Against Subtle Context\n Perturbation","summary":" Grammatical Error Correction (GEC) systems play a vital role in assisting\npeople with their daily writing tasks. However, users may sometimes come across\na GEC system that initially performs well but fails to correct errors when the\ninputs are slightly modified. To ensure an ideal user experience, a reliable\nGEC system should have the ability to provide consistent and accurate\nsuggestions when encountering irrelevant context perturbations, which we refer\nto as context robustness. In this paper, we introduce RobustGEC, a benchmark\ndesigned to evaluate the context robustness of GEC systems. RobustGEC comprises\n5,000 GEC cases, each with one original error-correct sentence pair and five\nvariants carefully devised by human annotators. Utilizing RobustGEC, we reveal\nthat state-of-the-art GEC systems still lack sufficient robustness against\ncontext perturbations. In addition, we propose a simple yet effective method\nfor remitting this issue.\n","authors":["Yue Zhang","Leyang Cui","Enbo Zhao","Wei Bi","Shuming Shi"],"pdf_url":"https://arxiv.org/pdf/2310.07299v1.pdf","comment":"Accepted to EMNLP 2023 (main conference, long paper)"},{"id":"http://arxiv.org/abs/2310.07289v1","updated":"2023-10-11T08:22:37Z","published":"2023-10-11T08:22:37Z","title":"Beyond Factuality: A Comprehensive Evaluation of Large Language Models\n as Knowledge Generators","summary":" Large language models (LLMs) outperform information retrieval techniques for\ndownstream knowledge-intensive tasks when being prompted to generate world\nknowledge. 
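Query2doc (above) expands a query by prompting an LLM for a pseudo-document and concatenating it with the original query before retrieval. A minimal sketch follows; `generate_pseudo_document` is a placeholder for the few-shot LLM call, and repeating the query so its terms stay dominant for sparse retrieval is an assumption rather than a detail stated in the abstract.

```python
# Sketch of query2doc-style expansion. The LLM call is a hypothetical stub and
# the query-repetition weighting is an assumption made for illustration.

def generate_pseudo_document(query: str) -> str:
    """Placeholder for a few-shot LLM call that writes a short passage answering the query."""
    return f"A short passage that plausibly answers: {query}"

def expand_query(query: str, query_repeats: int = 5) -> str:
    pseudo_doc = generate_pseudo_document(query)
    # Repeat the original query before appending the pseudo-document.
    return " ".join([query] * query_repeats + [pseudo_doc])

if __name__ == "__main__":
    print(expand_query("who wrote the iliad"))
```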
However, community concerns abound regarding the factuality and\npotential implications of using this uncensored knowledge. In light of this, we\nintroduce CONNER, a COmpreheNsive kNowledge Evaluation fRamework, designed to\nsystematically and automatically evaluate generated knowledge from six\nimportant perspectives -- Factuality, Relevance, Coherence, Informativeness,\nHelpfulness and Validity. We conduct an extensive empirical analysis of the\ngenerated knowledge from three different types of LLMs on two widely studied\nknowledge-intensive tasks, i.e., open-domain question answering and\nknowledge-grounded dialogue. Surprisingly, our study reveals that the\nfactuality of generated knowledge, even if lower, does not significantly hinder\ndownstream tasks. Instead, the relevance and coherence of the outputs are more\nimportant than small factual mistakes. Further, we show how to use CONNER to\nimprove knowledge-intensive tasks by designing two strategies: Prompt\nEngineering and Knowledge Selection. Our evaluation code and LLM-generated\nknowledge with human annotations will be released to facilitate future\nresearch.\n","authors":["Liang Chen","Yang Deng","Yatao Bian","Zeyu Qin","Bingzhe Wu","Tat-Seng Chua","Kam-Fai Wong"],"pdf_url":"https://arxiv.org/pdf/2310.07289v1.pdf","comment":"Accepted to EMNLP 2023 main conference"},{"id":"http://arxiv.org/abs/2310.07284v1","updated":"2023-10-11T08:17:54Z","published":"2023-10-11T08:17:54Z","title":"Typing to Listen at the Cocktail Party: Text-Guided Target Speaker\n Extraction","summary":" Humans possess an extraordinary ability to selectively focus on the sound\nsource of interest amidst complex acoustic environments, commonly referred to\nas cocktail party scenarios. In an attempt to replicate this remarkable\nauditory attention capability in machines, target speaker extraction (TSE)\nmodels have been developed. These models leverage the pre-registered cues of\nthe target speaker to extract the sound source of interest. However, the\neffectiveness of these models is hindered in real-world scenarios due to the\npotential variation or even absence of pre-registered cues. To address this\nlimitation, this study investigates the integration of natural language to\nenhance the flexibility and controllability of existing TSE models.\nSpecifically, we propose a model named LLM-TSE, wherein a large language model\n(LLM) to extract useful semantic cues from the user's typed text input, which\ncan complement the pre-registered cues or work independently to control the TSE\nprocess. Our experimental results demonstrate competitive performance when only\ntext-based cues are presented, and a new state-of-the-art is set when combined\nwith pre-registered acoustic cues. To the best of our knowledge, this is the\nfirst work that has successfully incorporated text-based cues to guide target\nspeaker extraction, which can be a cornerstone for cocktail party problem\nresearch.\n","authors":["Xiang Hao","Jibin Wu","Jianwei Yu","Chenglin Xu","Kay Chen Tan"],"pdf_url":"https://arxiv.org/pdf/2310.07284v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2310.07282v1","updated":"2023-10-11T08:16:35Z","published":"2023-10-11T08:16:35Z","title":"An Analysis on Large Language Models in Healthcare: A Case Study of\n BioBERT","summary":" This paper conducts a comprehensive investigation into applying large\nlanguage models, particularly on BioBERT, in healthcare. 
It begins with\nthoroughly examining previous natural language processing (NLP) approaches in\nhealthcare, shedding light on the limitations and challenges these methods\nface. Following that, this research explores the path that led to the\nincorporation of BioBERT into healthcare applications, highlighting its\nsuitability for addressing the specific requirements of tasks related to\nbiomedical text mining. The analysis outlines a systematic methodology for\nfine-tuning BioBERT to meet the unique needs of the healthcare domain. This\napproach includes various components, including the gathering of data from a\nwide range of healthcare sources, data annotation for tasks like identifying\nmedical entities and categorizing them, and the application of specialized\npreprocessing techniques tailored to handle the complexities found in\nbiomedical texts. Additionally, the paper covers aspects related to model\nevaluation, with a focus on healthcare benchmarks and functions like processing\nof natural language in biomedical, question-answering, clinical document\nclassification, and medical entity recognition. It explores techniques to\nimprove the model's interpretability and validates its performance compared to\nexisting healthcare-focused language models. The paper thoroughly examines\nethical considerations, particularly patient privacy and data security. It\nhighlights the benefits of incorporating BioBERT into healthcare contexts,\nincluding enhanced clinical decision support and more efficient information\nretrieval. Nevertheless, it acknowledges the impediments and complexities of\nthis integration, encompassing concerns regarding data privacy, transparency,\nresource-intensive requirements, and the necessity for model customization to\nalign with diverse healthcare domains.\n","authors":["Shyni Sharaf","V. S. Anoop"],"pdf_url":"https://arxiv.org/pdf/2310.07282v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.03347v4","updated":"2023-10-11T08:13:28Z","published":"2023-04-06T19:53:59Z","title":"Towards Interpretable Mental Health Analysis with Large Language Models","summary":" The latest large language models (LLMs) such as ChatGPT, exhibit strong\ncapabilities in automated mental health analysis. However, existing relevant\nstudies bear several limitations, including inadequate evaluations, lack of\nprompting strategies, and ignorance of exploring LLMs for explainability. To\nbridge these gaps, we comprehensively evaluate the mental health analysis and\nemotional reasoning ability of LLMs on 11 datasets across 5 tasks. We explore\nthe effects of different prompting strategies with unsupervised and distantly\nsupervised emotional information. Based on these prompts, we explore LLMs for\ninterpretable mental health analysis by instructing them to generate\nexplanations for each of their decisions. We convey strict human evaluations to\nassess the quality of the generated explanations, leading to a novel dataset\nwith 163 human-assessed explanations. We benchmark existing automatic\nevaluation metrics on this dataset to guide future related works. According to\nthe results, ChatGPT shows strong in-context learning ability but still has a\nsignificant gap with advanced task-specific methods. Careful prompt engineering\nwith emotional cues and expert-written few-shot examples can also effectively\nimprove performance on mental health analysis. 
In addition, ChatGPT generates\nexplanations that approach human performance, showing its great potential in\nexplainable mental health analysis.\n","authors":["Kailai Yang","Shaoxiong Ji","Tianlin Zhang","Qianqian Xie","Ziyan Kuang","Sophia Ananiadou"],"pdf_url":"https://arxiv.org/pdf/2304.03347v4.pdf","comment":"Accepted by EMNLP 2023 main conference as a long paper"},{"id":"http://arxiv.org/abs/2310.07279v1","updated":"2023-10-11T08:07:22Z","published":"2023-10-11T08:07:22Z","title":"Enhancing expressivity transfer in textless speech-to-speech translation","summary":" Textless speech-to-speech translation systems are rapidly advancing, thanks\nto the integration of self-supervised learning techniques. However, existing\nstate-of-the-art systems fall short when it comes to capturing and transferring\nexpressivity accurately across different languages. Expressivity plays a vital\nrole in conveying emotions, nuances, and cultural subtleties, thereby enhancing\ncommunication across diverse languages. To address this issue this study\npresents a novel method that operates at the discrete speech unit level and\nleverages multilingual emotion embeddings to capture language-agnostic\ninformation. Specifically, we demonstrate how these embeddings can be used to\neffectively predict the pitch and duration of speech units in the target\nlanguage. Through objective and subjective experiments conducted on a\nFrench-to-English translation task, our findings highlight the superior\nexpressivity transfer achieved by our approach compared to current\nstate-of-the-art systems.\n","authors":["Jarod Duret","Benjamin O'Brien","Yannick Estève","Titouan Parcollet"],"pdf_url":"https://arxiv.org/pdf/2310.07279v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07276v1","updated":"2023-10-11T07:57:08Z","published":"2023-10-11T07:57:08Z","title":"BioT5: Enriching Cross-modal Integration in Biology with Chemical\n Knowledge and Natural Language Associations","summary":" Recent advancements in biological research leverage the integration of\nmolecules, proteins, and natural language to enhance drug discovery. However,\ncurrent models exhibit several limitations, such as the generation of invalid\nmolecular SMILES, underutilization of contextual information, and equal\ntreatment of structured and unstructured knowledge. To address these issues, we\npropose $\\mathbf{BioT5}$, a comprehensive pre-training framework that enriches\ncross-modal integration in biology with chemical knowledge and natural language\nassociations. $\\mathbf{BioT5}$ utilizes SELFIES for $100%$ robust molecular\nrepresentations and extracts knowledge from the surrounding context of\nbio-entities in unstructured biological literature. Furthermore,\n$\\mathbf{BioT5}$ distinguishes between structured and unstructured knowledge,\nleading to more effective utilization of information. After fine-tuning, BioT5\nshows superior performance across a wide range of tasks, demonstrating its\nstrong capability of capturing underlying relations and properties of\nbio-entities. 
Our code is available at\n$\\href{https://github.com/QizhiPei/BioT5}{Github}$.\n","authors":["Qizhi Pei","Wei Zhang","Jinhua Zhu","Kehan Wu","Kaiyuan Gao","Lijun Wu","Yingce Xia","Rui Yan"],"pdf_url":"https://arxiv.org/pdf/2310.07276v1.pdf","comment":"Empirical Methods in Natural Language Processing (EMNLP 2023)"},{"id":"http://arxiv.org/abs/2310.07251v1","updated":"2023-10-11T07:27:34Z","published":"2023-10-11T07:27:34Z","title":"Ethical Reasoning over Moral Alignment: A Case and Framework for\n In-Context Ethical Policies in LLMs","summary":" In this position paper, we argue that instead of morally aligning LLMs to\nspecific set of ethical principles, we should infuse generic ethical reasoning\ncapabilities into them so that they can handle value pluralism at a global\nscale. When provided with an ethical policy, an LLM should be capable of making\ndecisions that are ethically consistent to the policy. We develop a framework\nthat integrates moral dilemmas with moral principles pertaining to different\nforamlisms of normative ethics, and at different levels of abstractions.\nInitial experiments with GPT-x models shows that while GPT-4 is a nearly\nperfect ethical reasoner, the models still have bias towards the moral values\nof Western and English speaking societies.\n","authors":["Abhinav Rao","Aditi Khandelwal","Kumar Tanmay","Utkarsh Agarwal","Monojit Choudhury"],"pdf_url":"https://arxiv.org/pdf/2310.07251v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.06450v2","updated":"2023-10-11T07:04:04Z","published":"2023-10-10T09:20:14Z","title":"Constructive Large Language Models Alignment with Diverse Feedback","summary":" In recent research on large language models (LLMs), there has been a growing\nemphasis on aligning these models with human values to reduce the impact of\nharmful content. However, current alignment methods often rely solely on\nsingular forms of human feedback, such as preferences, annotated labels, or\nnatural language critiques, overlooking the potential advantages of combining\nthese feedback types. This limitation leads to suboptimal performance, even\nwhen ample training data is available. In this paper, we introduce Constructive\nand Diverse Feedback (CDF) as a novel method to enhance LLM alignment, inspired\nby constructivist learning theory. Our approach involves collecting three\ndistinct types of feedback tailored to problems of varying difficulty levels\nwithin the training dataset. Specifically, we exploit critique feedback for\neasy problems, refinement feedback for medium problems, and preference feedback\nfor hard problems. By training our model with this diversified feedback, we\nachieve enhanced alignment performance while using less training data. To\nassess the effectiveness of CDF, we evaluate it against previous methods in\nthree downstream tasks: question answering, dialog generation, and text\nsummarization. 
Experimental results demonstrate that CDF achieves superior\nperformance even with a smaller training dataset.\n","authors":["Tianshu Yu","Ting-En Lin","Yuchuan Wu","Min Yang","Fei Huang","Yongbin Li"],"pdf_url":"https://arxiv.org/pdf/2310.06450v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07225v1","updated":"2023-10-11T06:26:19Z","published":"2023-10-11T06:26:19Z","title":"Exploring the Landscape of Large Language Models In Medical Question\n Answering: Observations and Open Questions","summary":" Large Language Models (LLMs) have shown promise in medical question answering\nby achieving passing scores in standardised exams and have been suggested as\ntools for supporting healthcare workers. Deploying LLMs into such a high-risk\ncontext requires a clear understanding of the limitations of these models. With\nthe rapid development and release of new LLMs, it is especially valuable to\nidentify patterns which exist across models and may, therefore, continue to\nappear in newer versions. In this paper, we evaluate a wide range of popular\nLLMs on their knowledge of medical questions in order to better understand\ntheir properties as a group. From this comparison, we provide preliminary\nobservations and raise open questions for further research.\n","authors":["Karolina Korgul","Andrew M. Bean","Felix Krones","Robert McCraith","Adam Mahdi"],"pdf_url":"https://arxiv.org/pdf/2310.07225v1.pdf","comment":"11 pages, 8 figures"},{"id":"http://arxiv.org/abs/2310.05028v3","updated":"2023-10-11T06:16:30Z","published":"2023-10-08T06:17:39Z","title":"Revisiting Large Language Models as Zero-shot Relation Extractors","summary":" Relation extraction (RE) consistently involves a certain degree of labeled or\nunlabeled data even if under zero-shot setting. Recent studies have shown that\nlarge language models (LLMs) transfer well to new tasks out-of-the-box simply\ngiven a natural language prompt, which provides the possibility of extracting\nrelations from text without any data and parameter tuning. This work focuses on\nthe study of exploring LLMs, such as ChatGPT, as zero-shot relation extractors.\nOn the one hand, we analyze the drawbacks of existing RE prompts and attempt to\nincorporate recent prompt techniques such as chain-of-thought (CoT) to improve\nzero-shot RE. We propose the summarize-and-ask (\\textsc{SumAsk}) prompting, a\nsimple prompt recursively using LLMs to transform RE inputs to the effective\nquestion answering (QA) format. On the other hand, we conduct comprehensive\nexperiments on various benchmarks and settings to investigate the capabilities\nof LLMs on zero-shot RE. Specifically, we have the following findings: (i)\n\\textsc{SumAsk} consistently and significantly improves LLMs performance on\ndifferent model sizes, benchmarks and settings; (ii) Zero-shot prompting with\nChatGPT achieves competitive or superior results compared with zero-shot and\nfully supervised methods; (iii) LLMs deliver promising performance in\nextracting overlapping relations; (iv) The performance varies greatly regarding\ndifferent relations. 
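The zero-shot relation extraction abstract above introduces summarize-and-ask (SumAsk) prompting, which uses the LLM to recast RE inputs into a QA format. One plausible, simplified reading is sketched below: summarize the sentence around the entity pair, then ask a yes/no question per candidate relation. The prompt wording and the `call_llm` stub are assumptions; the paper's recursive procedure may differ.

```python
# Simplified SumAsk-style sketch for zero-shot relation extraction.
# `call_llm` is a toy stand-in for a real LLM; prompts are illustrative.

from typing import List, Optional

def call_llm(prompt: str) -> str:
    """Hypothetical LLM interface returning plain text (deterministic toy stub)."""
    if prompt.startswith("Summarize"):
        return "Jobs founded Apple."
    return "yes" if "'founded'" in prompt else "no"

def sum_ask(sentence: str, head: str, tail: str, relations: List[str]) -> Optional[str]:
    # Step 1: summarize the sentence with respect to the entity pair.
    summary = call_llm(
        f"Summarize the following sentence, focusing on '{head}' and '{tail}':\n{sentence}"
    )
    # Step 2: ask a yes/no question for each candidate relation.
    for relation in relations:
        answer = call_llm(
            f"Summary: {summary}\nQuestion: does '{head}' have the relation "
            f"'{relation}' to '{tail}'? Answer yes or no."
        )
        if answer.strip().lower().startswith("yes"):
            return relation
    return None  # none-of-the-above

if __name__ == "__main__":
    print(sum_ask("Jobs founded Apple in 1976.", "Jobs", "Apple",
                  ["founded", "born_in", "works_for"]))
```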
Different from small language models, LLMs are effective\nin handling challenge none-of-the-above (NoTA) relation.\n","authors":["Guozheng Li","Peng Wang","Wenjun Ke"],"pdf_url":"https://arxiv.org/pdf/2310.05028v3.pdf","comment":"Findings of EMNLP 2023"},{"id":"http://arxiv.org/abs/2303.08518v3","updated":"2023-10-11T05:40:41Z","published":"2023-03-15T10:53:49Z","title":"UPRISE: Universal Prompt Retrieval for Improving Zero-Shot Evaluation","summary":" Large Language Models (LLMs) are popular for their impressive abilities, but\nthe need for model-specific fine-tuning or task-specific prompt engineering can\nhinder their generalization. We propose UPRISE (Universal Prompt Retrieval for\nImproving zero-Shot Evaluation), which tunes a lightweight and versatile\nretriever that automatically retrieves prompts for a given zero-shot task\ninput. Specifically, we demonstrate universality in a cross-task and\ncross-model scenario: the retriever is tuned on a diverse set of tasks, but\ntested on unseen task types; we use a small frozen LLM, GPT-Neo-2.7B, for\ntuning the retriever, but test the retriever on different LLMs of much larger\nscales, such as BLOOM-7.1B, OPT-66B and GPT3-175B. Additionally, we show that\nUPRISE mitigates the hallucination problem in our experiments with ChatGPT,\nsuggesting its potential to improve even the strongest LLMs. Our model and code\nare available at https://github.com/microsoft/LMOps.\n","authors":["Daixuan Cheng","Shaohan Huang","Junyu Bi","Yuefeng Zhan","Jianfeng Liu","Yujing Wang","Hao Sun","Furu Wei","Denvy Deng","Qi Zhang"],"pdf_url":"https://arxiv.org/pdf/2303.08518v3.pdf","comment":"EMNLP 2023 Main Conference"},{"id":"http://arxiv.org/abs/2305.16340v2","updated":"2023-10-11T05:32:13Z","published":"2023-05-24T03:47:22Z","title":"Segmented Recurrent Transformer: An Efficient Sequence-to-Sequence Model","summary":" Transformers have shown dominant performance across a range of domains\nincluding language and vision. However, their computational cost grows\nquadratically with the sequence length, making their usage prohibitive for\nresource-constrained applications. To counter this, our approach is to divide\nthe whole sequence into segments and use local attention mechanism on the\nindividual segments. We propose a segmented recurrent transformer (SRformer)\nthat combines segmented (local) attention with recurrent attention. The loss\ncaused by reducing the attention window length is compensated by aggregating\ninformation across segments with recurrent attention. SRformer leverages\nRecurrent Accumulate-and-Fire (RAF) neurons' inherent memory to update the\ncumulative product of keys and values. The segmented attention and lightweight\nRAF neurons ensure the efficiency of the proposed transformer. Such an approach\nleads to models with sequential processing capability at a lower\ncomputation/memory cost. We apply the proposed method to T5 and BART\ntransformers. The modified models are tested on summarization datasets\nincluding CNN-dailymail, XSUM, ArXiv, and MediaSUM. Notably, using segmented\ninputs of varied sizes, the proposed model achieves $6-22\\%$ higher ROUGE1\nscores than a segmented transformer and outperforms other recurrent transformer\napproaches. 
Furthermore, compared to full attention, the proposed model reduces\nthe computational complexity of cross attention by around $40\\%$.\n","authors":["Yinghan Long","Sayeed Shafayet Chowdhury","Kaushik Roy"],"pdf_url":"https://arxiv.org/pdf/2305.16340v2.pdf","comment":"Findings of EMNLP 2023"},{"id":"http://arxiv.org/abs/2305.14251v2","updated":"2023-10-11T05:27:50Z","published":"2023-05-23T17:06:00Z","title":"FActScore: Fine-grained Atomic Evaluation of Factual Precision in Long\n Form Text Generation","summary":" Evaluating the factuality of long-form text generated by large language\nmodels (LMs) is non-trivial because (1) generations often contain a mixture of\nsupported and unsupported pieces of information, making binary judgments of\nquality inadequate, and (2) human evaluation is time-consuming and costly. In\nthis paper, we introduce FACTSCORE, a new evaluation that breaks a generation\ninto a series of atomic facts and computes the percentage of atomic facts\nsupported by a reliable knowledge source. We conduct an extensive human\nevaluation to obtain FACTSCOREs of people biographies generated by several\nstate-of-the-art commercial LMs -- InstructGPT, ChatGPT, and the\nretrieval-augmented PerplexityAI -- and report new analysis demonstrating the\nneed for such a fine-grained score (e.g., ChatGPT only achieves 58%). Since\nhuman evaluation is costly, we also introduce an automated model that estimates\nFACTSCORE using retrieval and a strong language model, with less than a 2%\nerror rate. Finally, we use this automated metric to evaluate 6,500 generations\nfrom a new set of 13 recent LMs that would have cost $26K if evaluated by\nhumans, with various findings: GPT-4 and ChatGPT are more factual than public\nmodels, and Vicuna and Alpaca are some of the best public models. FACTSCORE is\navailable for public use via `pip install factscore`.\n","authors":["Sewon Min","Kalpesh Krishna","Xinxi Lyu","Mike Lewis","Wen-tau Yih","Pang Wei Koh","Mohit Iyyer","Luke Zettlemoyer","Hannaneh Hajishirzi"],"pdf_url":"https://arxiv.org/pdf/2305.14251v2.pdf","comment":"25 pages; 7 figures. Published as a main conference paper at EMNLP\n 2023. Code available at https://github.com/shmsw25/FActScore"},{"id":"http://arxiv.org/abs/2204.07994v2","updated":"2023-10-11T05:12:10Z","published":"2022-04-17T12:33:34Z","title":"Knowledgeable Salient Span Mask for Enhancing Language Models as\n Knowledge Base","summary":" Pre-trained language models (PLMs) like BERT have made significant progress\nin various downstream NLP tasks. However, by asking models to do cloze-style\ntests, recent work finds that PLMs are short in acquiring knowledge from\nunstructured text. To understand the internal behaviour of PLMs in retrieving\nknowledge, we first define knowledge-baring (K-B) tokens and knowledge-free\n(K-F) tokens for unstructured text and ask professional annotators to label\nsome samples manually. Then, we find that PLMs are more likely to give wrong\npredictions on K-B tokens and attend less attention to those tokens inside the\nself-attention module. Based on these observations, we develop two solutions to\nhelp the model learn more knowledge from unstructured text in a fully\nself-supervised manner. Experiments on knowledge-intensive tasks show the\neffectiveness of the proposed methods. 
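FActScore (described above) scores a long-form generation as the fraction of its atomic facts supported by a reliable knowledge source. The sketch below shows that final aggregation step only; the fact splitting and the support checker are toy stand-ins, and the released `factscore` package implements the actual pipeline.

```python
# Sketch of the FActScore aggregation: fraction of atomic facts judged as
# supported. The fact list and the support lookup are toy examples.

from typing import Callable, List

def factscore(atomic_facts: List[str],
              is_supported: Callable[[str], bool]) -> float:
    if not atomic_facts:
        return 0.0
    return sum(is_supported(f) for f in atomic_facts) / len(atomic_facts)

if __name__ == "__main__":
    facts = [
        "Marie Curie was born in Warsaw.",
        "Marie Curie won two Nobel Prizes.",
        "Marie Curie invented the telephone.",
    ]
    knowledge_base = {
        "Marie Curie was born in Warsaw.": True,
        "Marie Curie won two Nobel Prizes.": True,
        "Marie Curie invented the telephone.": False,
    }
    print(factscore(facts, lambda f: knowledge_base.get(f, False)))  # ~0.67
```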
To our best knowledge, we are the first\nto explore fully self-supervised learning of knowledge in continual\npre-training.\n","authors":["Cunxiang Wang","Fuli Luo","Yanyang Li","Runxin Xu","Fei Huang","Yue Zhang"],"pdf_url":"https://arxiv.org/pdf/2204.07994v2.pdf","comment":"NLPCC-2023"},{"id":"http://arxiv.org/abs/2309.17421v2","updated":"2023-10-11T05:07:37Z","published":"2023-09-29T17:34:51Z","title":"The Dawn of LMMs: Preliminary Explorations with GPT-4V(ision)","summary":" Large multimodal models (LMMs) extend large language models (LLMs) with\nmulti-sensory skills, such as visual understanding, to achieve stronger generic\nintelligence. In this paper, we analyze the latest model, GPT-4V(ision), to\ndeepen the understanding of LMMs. The analysis focuses on the intriguing tasks\nthat GPT-4V can perform, containing test samples to probe the quality and\ngenericity of GPT-4V's capabilities, its supported inputs and working modes,\nand the effective ways to prompt the model. In our approach to exploring\nGPT-4V, we curate and organize a collection of carefully designed qualitative\nsamples spanning a variety of domains and tasks. Observations from these\nsamples demonstrate that GPT-4V's unprecedented ability in processing\narbitrarily interleaved multimodal inputs and the genericity of its\ncapabilities together make GPT-4V a powerful multimodal generalist system.\nFurthermore, GPT-4V's unique capability of understanding visual markers drawn\non input images can give rise to new human-computer interaction methods such as\nvisual referring prompting. We conclude the report with in-depth discussions on\nthe emerging application scenarios and the future research directions for\nGPT-4V-based systems. We hope that this preliminary exploration will inspire\nfuture research on the next-generation multimodal task formulation, new ways to\nexploit and enhance LMMs to solve real-world problems, and gaining better\nunderstanding of multimodal foundation models. Finally, we acknowledge that the\nmodel under our study is solely the product of OpenAI's innovative work, and\nthey should be fully credited for its development. Please see the GPT-4V\ncontributions paper for the authorship and credit attribution:\nhttps://cdn.openai.com/contributions/gpt-4v.pdf\n","authors":["Zhengyuan Yang","Linjie Li","Kevin Lin","Jianfeng Wang","Chung-Ching Lin","Zicheng Liu","Lijuan Wang"],"pdf_url":"https://arxiv.org/pdf/2309.17421v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.06200v2","updated":"2023-10-11T05:00:10Z","published":"2023-10-09T23:02:07Z","title":"The Importance of Prompt Tuning for Automated Neuron Explanations","summary":" Recent advances have greatly increased the capabilities of large language\nmodels (LLMs), but our understanding of the models and their safety has not\nprogressed as fast. In this paper we aim to understand LLMs deeper by studying\ntheir individual neurons. We build upon previous work showing large language\nmodels such as GPT-4 can be useful in explaining what each neuron in a language\nmodel does. Specifically, we analyze the effect of the prompt used to generate\nexplanations and show that reformatting the explanation prompt in a more\nnatural way can significantly improve neuron explanation quality and greatly\nreduce computational cost. 
We demonstrate the effects of our new prompts in\nthree different ways, incorporating both automated and human evaluations.\n","authors":["Justin Lee","Tuomas Oikarinen","Arjun Chatha","Keng-Chi Chang","Yilan Chen","Tsui-Wei Weng"],"pdf_url":"https://arxiv.org/pdf/2310.06200v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07188v1","updated":"2023-10-11T04:30:18Z","published":"2023-10-11T04:30:18Z","title":"Adaptive Gating in Mixture-of-Experts based Language Models","summary":" Large language models, such as OpenAI's ChatGPT, have demonstrated\nexceptional language understanding capabilities in various NLP tasks. Sparsely\nactivated mixture-of-experts (MoE) has emerged as a promising solution for\nscaling models while maintaining a constant number of computational operations.\nExisting MoE model adopts a fixed gating network where each token is computed\nby the same number of experts. However, this approach contradicts our intuition\nthat the tokens in each sequence vary in terms of their linguistic complexity\nand, consequently, require different computational costs. Little is discussed\nin prior research on the trade-off between computation per token and model\nperformance. This paper introduces adaptive gating in MoE, a flexible training\nstrategy that allows tokens to be processed by a variable number of experts\nbased on expert probability distribution. The proposed framework preserves\nsparsity while improving training efficiency. Additionally, curriculum learning\nis leveraged to further reduce training time. Extensive experiments on diverse\nNLP tasks show that adaptive gating reduces at most 22.5% training time while\nmaintaining inference quality. Moreover, we conduct a comprehensive analysis of\nthe routing decisions and present our insights when adaptive gating is used.\n","authors":["Jiamin Li","Qiang Su","Yitao Yang","Yimin Jiang","Cong Wang","Hong Xu"],"pdf_url":"https://arxiv.org/pdf/2310.07188v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.12920v4","updated":"2023-10-11T04:08:26Z","published":"2023-01-30T14:19:29Z","title":"Active Learning for Multilingual Semantic Parser","summary":" Current multilingual semantic parsing (MSP) datasets are almost all collected\nby translating the utterances in the existing datasets from the resource-rich\nlanguage to the target language. However, manual translation is costly. To\nreduce the translation effort, this paper proposes the first active learning\nprocedure for MSP (AL-MSP). AL-MSP selects only a subset from the existing\ndatasets to be translated. We also propose a novel selection method that\nprioritizes the examples diversifying the logical form structures with more\nlexical choices, and a novel hyperparameter tuning method that needs no extra\nannotation cost. Our experiments show that AL-MSP significantly reduces\ntranslation costs with ideal selection methods. Our selection method with\nproper hyperparameters yields better parsing performance than the other\nbaselines on two multilingual datasets.\n","authors":["Zhuang Li","Gholamreza Haffari"],"pdf_url":"https://arxiv.org/pdf/2301.12920v4.pdf","comment":"EACL 2023 (findings)"},{"id":"http://arxiv.org/abs/2310.07177v1","updated":"2023-10-11T04:03:42Z","published":"2023-10-11T04:03:42Z","title":"Online Speculative Decoding","summary":" Speculative decoding is a pivotal technique to accelerate the inference of\nlarge language models (LLMs) by employing a smaller draft model to predict the\ntarget model's outputs. 
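The adaptive-gating MoE abstract above routes each token to a variable number of experts based on the gate's probability distribution. The sketch below assumes one plausible rule, selecting experts in descending probability until a cumulative threshold is reached; the threshold and cap are illustrative and the paper's exact criterion may differ.

```python
# Sketch: per-token adaptive expert selection under an assumed
# cumulative-probability rule (illustrative, not the paper's exact method).

from typing import List

def adaptive_expert_selection(gate_probs: List[float],
                              threshold: float = 0.8,
                              max_experts: int = 4) -> List[int]:
    """Pick experts in descending gate probability until the cumulative
    probability reaches `threshold` or `max_experts` is hit."""
    order = sorted(range(len(gate_probs)), key=lambda i: gate_probs[i], reverse=True)
    selected, cumulative = [], 0.0
    for idx in order:
        selected.append(idx)
        cumulative += gate_probs[idx]
        if cumulative >= threshold or len(selected) == max_experts:
            break
    return selected

if __name__ == "__main__":
    easy_token = [0.9, 0.05, 0.03, 0.02]   # one expert suffices
    hard_token = [0.3, 0.3, 0.25, 0.15]    # needs several experts
    print(adaptive_expert_selection(easy_token))
    print(adaptive_expert_selection(hard_token))
```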
However, its efficacy can be limited due to the low\npredictive accuracy of the draft model, particularly when faced with diverse\ntext inputs and a significant capability gap between the draft and target\nmodels. We introduce online speculative decoding (OSD) to address this\nchallenge. The main idea is to continually update (multiple) draft model(s) on\nobserved user query data using the abundant excess computational power in an\nLLM serving cluster. Given that LLM inference is memory-bounded, the surplus\ncomputational power in a typical LLM serving cluster can be repurposed for\nonline retraining of draft models, thereby making the training cost-neutral.\nSince the query distribution of an LLM service is relatively simple, retraining\non query distribution enables the draft model to more accurately predict the\ntarget model's outputs, particularly on data originating from query\ndistributions. As the draft model evolves online, it aligns with the query\ndistribution in real time, mitigating distribution shifts. We develop a\nprototype of online speculative decoding based on online knowledge distillation\nand evaluate it using both synthetic and real query data on several popular\nLLMs. The results show a substantial increase in the token acceptance rate by\n0.1 to 0.65, which translates into 1.22x to 3.06x latency reduction.\n","authors":["Xiaoxuan Liu","Lanxiang Hu","Peter Bailis","Ion Stoica","Zhijie Deng","Alvin Cheung","Hao Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.07177v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07170v1","updated":"2023-10-11T03:39:46Z","published":"2023-10-11T03:39:46Z","title":"PHALM: Building a Knowledge Graph from Scratch by Prompting Humans and a\n Language Model","summary":" Despite the remarkable progress in natural language understanding with\npretrained Transformers, neural language models often do not handle commonsense\nknowledge well. Toward commonsense-aware models, there have been attempts to\nobtain knowledge, ranging from automatic acquisition to crowdsourcing. However,\nit is difficult to obtain a high-quality knowledge base at a low cost,\nespecially from scratch. In this paper, we propose PHALM, a method of building\na knowledge graph from scratch, by prompting both crowdworkers and a large\nlanguage model (LLM). We used this method to build a Japanese event knowledge\ngraph and trained Japanese commonsense generation models. Experimental results\nrevealed the acceptability of the built graph and inferences generated by the\ntrained models. We also report the difference in prompting humans and an LLM.\nOur code, data, and models are available at\ngithub.com/nlp-waseda/comet-atomic-ja.\n","authors":["Tatsuya Ide","Eiki Murata","Daisuke Kawahara","Takato Yamazaki","Shengzhe Li","Kenta Shinzato","Toshinori Sato"],"pdf_url":"https://arxiv.org/pdf/2310.07170v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.15074v2","updated":"2023-10-11T03:38:56Z","published":"2023-05-24T11:55:59Z","title":"Have LLMs Advanced Enough? A Challenging Problem Solving Benchmark For\n Large Language Models","summary":" The performance of large language models (LLMs) on existing reasoning\nbenchmarks has significantly improved over the past years. In response, we\npresent JEEBench, a considerably more challenging benchmark dataset for\nevaluating the problem solving abilities of LLMs. We curate 515 challenging\npre-engineering mathematics, physics and chemistry problems from the highly\ncompetitive IIT JEE-Advanced exam. 
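The online speculative decoding abstract above builds on the standard draft-then-verify loop, where a small draft model proposes tokens and the target model accepts a prefix of them. The sketch below shows a simplified greedy-verification step to make the accepted-token idea concrete; the probabilistic accept/reject rule and the online distillation of the draft model are omitted, and all function names are assumptions.

```python
# Sketch: one simplified speculative-decoding step with greedy verification.
# Accept draft tokens while they match the target model's greedy choice,
# then append one token from the target model.

from typing import Callable, List

def speculative_step(prefix: List[str],
                     draft_propose: Callable[[List[str], int], List[str]],
                     target_next: Callable[[List[str]], str],
                     k: int = 4) -> List[str]:
    draft = draft_propose(prefix, k)
    accepted: List[str] = []
    for token in draft:
        expected = target_next(prefix + accepted)
        if token == expected:
            accepted.append(token)      # draft token verified
        else:
            accepted.append(expected)   # replace with the target's token and stop
            break
    else:
        accepted.append(target_next(prefix + accepted))  # bonus token when all match
    return prefix + accepted

if __name__ == "__main__":
    target_text = "the cat sat on the mat".split()
    target_next = lambda ctx: target_text[len(ctx)] if len(ctx) < len(target_text) else "<eos>"
    draft_propose = lambda ctx, k: ["the", "cat", "sat", "under"][:k]  # toy draft with one error
    print(speculative_step([], draft_propose, target_next))
```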
Long-horizon reasoning on top of deep\nin-domain knowledge is essential for solving problems in this benchmark. Our\nevaluation on various open-source and proprietary models reveals that the\nhighest performance, even after using techniques like self-consistency,\nself-refinement and chain-of-thought prompting, is less than 40\\%. The typical\nfailure modes of GPT-4, the best model, are errors in algebraic manipulation,\ndifficulty in grounding abstract concepts into mathematical equations\naccurately and failure in retrieving relevant domain-specific concepts. We also\nobserve that by mere prompting, GPT-4 is unable to assess risk introduced by\nnegative marking for incorrect answers. For this, we develop a post-hoc\nconfidence-thresholding method over self-consistency, which enables effective\nresponse selection. We hope that our challenging benchmark will guide future\nre-search in problem-solving using LLMs.\n","authors":["Daman Arora","Himanshu Gaurav Singh"," Mausam"],"pdf_url":"https://arxiv.org/pdf/2305.15074v2.pdf","comment":"v2"},{"id":"http://arxiv.org/abs/2310.07161v1","updated":"2023-10-11T03:19:22Z","published":"2023-10-11T03:19:22Z","title":"Psychoacoustic Challenges Of Speech Enhancement On VoIP Platforms","summary":" Within the ambit of VoIP (Voice over Internet Protocol) telecommunications,\nthe complexities introduced by acoustic transformations merit rigorous\nanalysis. This research, rooted in the exploration of proprietary sender-side\ndenoising effects, meticulously evaluates platforms such as Google Meets and\nZoom. The study draws upon the Deep Noise Suppression (DNS) 2020 dataset,\nensuring a structured examination tailored to various denoising settings and\nreceiver interfaces. A methodological novelty is introduced via the Oaxaca\ndecomposition, traditionally an econometric tool, repurposed herein to analyze\nacoustic-phonetic perturbations within VoIP systems. To further ground the\nimplications of these transformations, psychoacoustic metrics, specifically\nPESQ and STOI, were harnessed to furnish a comprehensive understanding of\nspeech alterations. Cumulatively, the insights garnered underscore the\nintricate landscape of VoIP-influenced acoustic dynamics. In addition to the\nprimary findings, a multitude of metrics are reported, extending the research\npurview. Moreover, out-of-domain benchmarking for both time and time-frequency\ndomain speech enhancement models is included, thereby enhancing the depth and\napplicability of this inquiry.\n","authors":["Joseph Konan","Ojas Bhargave","Shikhar Agnihotri","Shuo Han","Yunyang Zeng","Ankit Shah","Bhiksha Raj"],"pdf_url":"https://arxiv.org/pdf/2310.07161v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.02796v2","updated":"2023-10-11T03:16:23Z","published":"2023-07-06T06:11:51Z","title":"VerifAI: Verified Generative AI","summary":" Generative AI has made significant strides, yet concerns about the accuracy\nand reliability of its outputs continue to grow. Such inaccuracies can have\nserious consequences such as inaccurate decision-making, the spread of false\ninformation, privacy violations, legal liabilities, and more. Although efforts\nto address these risks are underway, including explainable AI and responsible\nAI practices such as transparency, privacy protection, bias mitigation, and\nsocial and environmental responsibility, misinformation caused by generative AI\nwill remain a significant challenge. 
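The JEEBench abstract above proposes post-hoc confidence thresholding over self-consistency so the model can abstain when negative marking makes a low-confidence guess costly. A minimal sketch follows: take the majority answer across samples and return it only if its vote share clears a threshold; the threshold value is an assumption.

```python
# Sketch: majority vote over sampled answers with abstention below a
# confidence threshold (threshold value is illustrative).

from collections import Counter
from typing import List, Optional

def thresholded_self_consistency(samples: List[str],
                                 threshold: float = 0.5) -> Optional[str]:
    if not samples:
        return None
    answer, votes = Counter(samples).most_common(1)[0]
    return answer if votes / len(samples) >= threshold else None  # None = abstain

if __name__ == "__main__":
    print(thresholded_self_consistency(["B", "B", "B", "C", "B"]))  # confident: "B"
    print(thresholded_self_consistency(["A", "B", "C", "D", "B"]))  # abstain: None
```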
We propose that verifying the outputs of\ngenerative AI from a data management perspective is an emerging issue for\ngenerative AI. This involves analyzing the underlying data from multi-modal\ndata lakes, including text files, tables, and knowledge graphs, and assessing\nits quality and consistency. By doing so, we can establish a stronger\nfoundation for evaluating the outputs of generative AI models. Such an approach\ncan ensure the correctness of generative AI, promote transparency, and enable\ndecision-making with greater confidence. Our vision is to promote the\ndevelopment of verifiable generative AI and contribute to a more trustworthy\nand responsible use of AI.\n","authors":["Nan Tang","Chenyu Yang","Ju Fan","Lei Cao","Yuyu Luo","Alon Halevy"],"pdf_url":"https://arxiv.org/pdf/2307.02796v2.pdf","comment":"8 pages, 4 figures"},{"id":"http://arxiv.org/abs/2310.07155v1","updated":"2023-10-11T03:01:42Z","published":"2023-10-11T03:01:42Z","title":"\"A Tale of Two Movements\": Identifying and Comparing Perspectives in\n #BlackLivesMatter and #BlueLivesMatter Movements-related Tweets using Weakly\n Supervised Graph-based Structured Prediction","summary":" Social media has become a major driver of social change, by facilitating the\nformation of online social movements. Automatically understanding the\nperspectives driving the movement and the voices opposing it is a challenging\ntask as annotated data is difficult to obtain. We propose a weakly supervised\ngraph-based approach that explicitly models perspectives in\n#BlackLivesMatter-related tweets. Our proposed approach utilizes a\nsocial-linguistic representation of the data. We convert the text to a graph by\nbreaking it into structured elements and connect it with the social network of\nauthors; then structured prediction is done over the elements for identifying\nperspectives. Our approach uses a small seed set of labeled examples. We\nexperiment with large language models for generating artificial training\nexamples, compare them to manual annotation, and find that it achieves\ncomparable performance. We perform quantitative and qualitative analyses using\na human-annotated test set. Our model outperforms multitask baselines by a\nlarge margin, successfully characterizing the perspectives supporting and\nopposing #BLM.\n","authors":["Shamik Roy","Dan Goldwasser"],"pdf_url":"https://arxiv.org/pdf/2310.07155v1.pdf","comment":"Accepted version to Findings of EMNLP 2023 (camera ready coming soon)"},{"id":"http://arxiv.org/abs/2310.07147v1","updated":"2023-10-11T02:47:40Z","published":"2023-10-11T02:47:40Z","title":"QFT: Quantized Full-parameter Tuning of LLMs with Affordable Resources","summary":" Large Language Models (LLMs) have showcased remarkable impacts across a wide\nspectrum of natural language processing tasks. Fine-tuning these pre-trained\nmodels on downstream datasets provides further significant performance gains,\nbut this process has been challenging due to its extraordinary resource\nrequirements. To this end, existing efforts focus on parameter-efficient\nfine-tuning, which, unfortunately, fail to capitalize on the powerful potential\nof full-parameter fine-tuning. In this work, we propose QFT, a novel Quantized\nFull-parameter Tuning framework for LLMs that enables memory-efficient\nfine-tuning without harming performance. 
Our framework incorporates two novel\nideas: (i) we adopt the efficient Lion optimizer, which only keeps track of the\nmomentum and has consistent update magnitudes for each parameter, an inherent\nadvantage for robust quantization; and (ii) we quantize all model states and\nstore them as integer values, and present a gradient flow and parameter update\nscheme for the quantized weights. As a result, QFT reduces the model state\nmemory to 21% of the standard solution while achieving comparable performance,\ne.g., tuning a LLaMA-7B model requires only <30GB of memory, satisfied by a\nsingle A6000 GPU.\n","authors":["Zhikai Li","Xiaoxuan Liu","Banghua Zhu","Zhen Dong","Qingyi Gu","Kurt Keutzer"],"pdf_url":"https://arxiv.org/pdf/2310.07147v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07146v1","updated":"2023-10-11T02:47:21Z","published":"2023-10-11T02:47:21Z","title":"Empowering Psychotherapy with Large Language Models: Cognitive\n Distortion Detection through Diagnosis of Thought Prompting","summary":" Mental illness remains one of the most critical public health issues of our\ntime, due to the severe scarcity and accessibility limit of professionals.\nPsychotherapy requires high-level expertise to conduct deep, complex reasoning\nand analysis on the cognition modeling of the patients. In the era of Large\nLanguage Models, we believe it is the right time to develop AI assistance for\ncomputational psychotherapy. We study the task of cognitive distortion\ndetection and propose the Diagnosis of Thought (DoT) prompting. DoT performs\ndiagnosis on the patient's speech via three stages: subjectivity assessment to\nseparate the facts and the thoughts; contrastive reasoning to elicit the\nreasoning processes supporting and contradicting the thoughts; and schema\nanalysis to summarize the cognition schemas. The generated diagnosis rationales\nthrough the three stages are essential for assisting the professionals.\nExperiments demonstrate that DoT obtains significant improvements over ChatGPT\nfor cognitive distortion detection, while generating high-quality rationales\napproved by human experts.\n","authors":["Zhiyu Chen","Yujie Lu","William Yang Wang"],"pdf_url":"https://arxiv.org/pdf/2310.07146v1.pdf","comment":"EMNLP 2023 Findings"},{"id":"http://arxiv.org/abs/2310.05280v2","updated":"2023-10-11T02:32:10Z","published":"2023-10-08T21:03:18Z","title":"Are Personalized Stochastic Parrots More Dangerous? Evaluating Persona\n Biases in Dialogue Systems","summary":" Recent advancements in Large Language Models empower them to follow freeform\ninstructions, including imitating generic or specific demographic personas in\nconversations. Generic personas refer to an individual from a demographic group\n(e.g. an Asian person), whereas specific personas can be actual names of\nhistorical figures. While the adoption of personas allows dialogue systems to\nbe more engaging and approachable to users, it also carries the potential risk\nof exacerbating social biases in model responses, further causing societal\nharms through interactions with users. In this paper, we systematically study\n\"persona biases\", which we define to be the sensitivity of harmful dialogue\nmodel behaviors to different persona adoptions. We categorize persona biases\ninto biases in harmful expression and harmful agreement, as well as establish a\ncomprehensive evaluation framework to measure persona biases in five aspects:\nOffensiveness, Toxic Continuation, Regard, Stereotype Agreement, and Toxic\nAgreement. 
Additionally, we propose to comprehensively investigate persona\nbiases through experimenting with UniversalPersona, a systematized persona\ndataset with a comprehensive list of both generic and specific model personas.\nThrough benchmarking on four different models, including Blender, ChatGPT,\nAlpaca, and Vicuna, our study uncovers significant persona biases in these\ndialogue systems. Findings of our study underscore the immediate need to\nrevisit the use of persona traits in dialogue agents to ensure their safe\napplication.\n","authors":["Yixin Wan","Jieyu Zhao","Aman Chadha","Nanyun Peng","Kai-Wei Chang"],"pdf_url":"https://arxiv.org/pdf/2310.05280v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07137v1","updated":"2023-10-11T02:22:28Z","published":"2023-10-11T02:22:28Z","title":"AE-smnsMLC: Multi-Label Classification with Semantic Matching and\n Negative Label Sampling for Product Attribute Value Extraction","summary":" Product attribute value extraction plays an important role for many\nreal-world applications in e-Commerce such as product search and\nrecommendation. Previous methods treat it as a sequence labeling task that\nneeds more annotation for position of values in the product text. This limits\ntheir application to real-world scenarios in which only attribute values are\nweakly-annotated for each product without their position. Moreover, these\nmethods only use product text (i.e., product title and description) and do not\nconsider the semantic connection between the multiple attribute values of a\ngiven product and its text, which can help attribute value extraction. In this\npaper, we reformulate this task as a multi-label classification task that can\nbe applied in real-world scenarios in which only annotation of attribute values\nis available to train models (i.e., annotation of positional information of\nattribute values is not available). We propose a classification model with\nsemantic matching and negative label sampling for attribute value extraction.\nSemantic matching aims to capture semantic interactions between attribute\nvalues of a given product and its text. Negative label sampling aims to enhance\nthe model's ability to distinguish similar values belonging to the same\nattribute. Experimental results on three subsets of a large real-world\ne-Commerce dataset demonstrate the effectiveness and superiority of our\nproposed model.\n","authors":["Zhongfen Deng","Wei-Te Chen","Lei Chen","Philip S. Yu"],"pdf_url":"https://arxiv.org/pdf/2310.07137v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07135v1","updated":"2023-10-11T02:16:12Z","published":"2023-10-11T02:16:12Z","title":"Comparing Styles across Languages","summary":" Understanding how styles differ across languages is advantageous for training\nboth humans and computers to generate culturally appropriate text. We introduce\nan explanation framework to extract stylistic differences from multilingual LMs\nand compare styles across languages. Our framework (1) generates comprehensive\nstyle lexica in any language and (2) consolidates feature importances from LMs\ninto comparable lexical categories. We apply this framework to compare\npoliteness, creating the first holistic multilingual politeness dataset and\nexploring how politeness varies across four languages. 
Our approach enables an\neffective evaluation of how distinct linguistic categories contribute to\nstylistic variations and provides interpretable insights into how people\ncommunicate differently around the world.\n","authors":["Shreya Havaldar","Matthew Pressimone","Eric Wong","Lyle Ungar"],"pdf_url":"https://arxiv.org/pdf/2310.07135v1.pdf","comment":"To appear in EMNLP 2023"},{"id":"http://arxiv.org/abs/2310.07106v1","updated":"2023-10-11T01:03:42Z","published":"2023-10-11T01:03:42Z","title":"The Temporal Structure of Language Processing in the Human Brain\n Corresponds to The Layered Hierarchy of Deep Language Models","summary":" Deep Language Models (DLMs) provide a novel computational paradigm for\nunderstanding the mechanisms of natural language processing in the human brain.\nUnlike traditional psycholinguistic models, DLMs use layered sequences of\ncontinuous numerical vectors to represent words and context, allowing a\nplethora of emerging applications such as human-like text generation. In this\npaper we show evidence that the layered hierarchy of DLMs may be used to model\nthe temporal dynamics of language comprehension in the brain by demonstrating a\nstrong correlation between DLM layer depth and the time at which layers are\nmost predictive of the human brain. Our ability to temporally resolve\nindividual layers benefits from our use of electrocorticography (ECoG) data,\nwhich has a much higher temporal resolution than noninvasive methods like fMRI.\nUsing ECoG, we record neural activity from participants listening to a\n30-minute narrative while also feeding the same narrative to a high-performing\nDLM (GPT2-XL). We then extract contextual embeddings from the different layers\nof the DLM and use linear encoding models to predict neural activity. We first\nfocus on the Inferior Frontal Gyrus (IFG, or Broca's area) and then extend our\nmodel to track the increasing temporal receptive window along the linguistic\nprocessing hierarchy from auditory to syntactic and semantic areas. Our results\nreveal a connection between human language processing and DLMs, with the DLM's\nlayer-by-layer accumulation of contextual information mirroring the timing of\nneural activity in high-order language areas.\n","authors":["Ariel Goldstein","Eric Ham","Mariano Schain","Samuel Nastase","Zaid Zada","Avigail Dabush","Bobbi Aubrey","Harshvardhan Gazula","Amir Feder","Werner K Doyle","Sasha Devore","Patricia Dugan","Daniel Friedman","Roi Reichart","Michael Brenner","Avinatan Hassidim","Orrin Devinsky","Adeen Flinker","Omer Levy","Uri Hasson"],"pdf_url":"https://arxiv.org/pdf/2310.07106v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07096v1","updated":"2023-10-11T00:38:57Z","published":"2023-10-11T00:38:57Z","title":"Sparse Universal Transformer","summary":" The Universal Transformer (UT) is a variant of the Transformer that shares\nparameters across its layers. Empirical evidence shows that UTs have better\ncompositional generalization than Vanilla Transformers (VTs) in formal language\ntasks. The parameter-sharing also affords it better parameter efficiency than\nVTs. Despite its many advantages, scaling UT parameters is much more compute\nand memory intensive than scaling up a VT. This paper proposes the Sparse\nUniversal Transformer (SUT), which leverages Sparse Mixture of Experts (SMoE)\nand a new stick-breaking-based dynamic halting mechanism to reduce UT's\ncomputation complexity while retaining its parameter efficiency and\ngeneralization ability. 
Experiments show that SUT achieves the same performance\nas strong baseline models while only using half computation and parameters on\nWMT'14 and strong generalization results on formal language tasks (Logical\ninference and CFQ). The new halting mechanism also enables around 50\\%\nreduction in computation during inference with very little performance decrease\non formal language tasks.\n","authors":["Shawn Tan","Yikang Shen","Zhenfang Chen","Aaron Courville","Chuang Gan"],"pdf_url":"https://arxiv.org/pdf/2310.07096v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.09442v3","updated":"2023-10-11T00:37:33Z","published":"2023-06-15T18:49:50Z","title":"Explore, Establish, Exploit: Red Teaming Language Models from Scratch","summary":" Deploying large language models (LMs) can pose hazards from harmful outputs\nsuch as toxic or false text. Prior work has introduced automated tools that\nelicit harmful outputs to identify these risks. While this is a valuable step\ntoward securing models, these approaches rely on a pre-existing way to\nefficiently classify undesirable outputs. Using a pre-existing classifier does\nnot allow for red-teaming to be tailored to the target model. Furthermore, when\nfailures can be easily classified in advance, red-teaming has limited marginal\nvalue because problems can be avoided by simply filtering training data and/or\nmodel outputs. Here, we consider red-teaming \"from scratch,\" in which the\nadversary does not begin with a way to classify failures. Our framework\nconsists of three steps: 1) Exploring the model's range of behaviors in the\ndesired context; 2) Establishing a definition and measurement for undesired\nbehavior (e.g., a classifier trained to reflect human evaluations); and 3)\nExploiting the model's flaws using this measure to develop diverse adversarial\nprompts. We use this approach to red-team GPT-3 to discover classes of inputs\nthat elicit false statements. In doing so, we construct the CommonClaim dataset\nof 20,000 statements labeled by humans as common-knowledge-true, common\nknowledge-false, or neither. We are making code and data available.\n","authors":["Stephen Casper","Jason Lin","Joe Kwon","Gatlen Culp","Dylan Hadfield-Menell"],"pdf_url":"https://arxiv.org/pdf/2306.09442v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07093v1","updated":"2023-10-11T00:18:29Z","published":"2023-10-11T00:18:29Z","title":"Argumentative Stance Prediction: An Exploratory Study on Multimodality\n and Few-Shot Learning","summary":" To advance argumentative stance prediction as a multimodal problem, the First\nShared Task in Multimodal Argument Mining hosted stance prediction in crucial\nsocial topics of gun control and abortion. Our exploratory study attempts to\nevaluate the necessity of images for stance prediction in tweets and compare\nout-of-the-box text-based large-language models (LLM) in few-shot settings\nagainst fine-tuned unimodal and multimodal models. Our work suggests an\nensemble of fine-tuned text-based language models (0.817 F1-score) outperforms\nboth the multimodal (0.677 F1-score) and text-based few-shot prediction using a\nrecent state-of-the-art LLM (0.550 F1-score). 
In addition to the differences in\nperformance, our findings suggest that the multimodal models tend to perform\nbetter when image content is summarized as natural language over their native\npixel structure, and that using in-context examples improves few-shot\nperformance of LLMs.\n","authors":["Arushi Sharma","Abhibha Gupta","Maneesh Bilalpur"],"pdf_url":"https://arxiv.org/pdf/2310.07093v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07091v1","updated":"2023-10-11T00:14:40Z","published":"2023-10-11T00:14:40Z","title":"Jaeger: A Concatenation-Based Multi-Transformer VQA Model","summary":" Document-based Visual Question Answering poses a challenging task between\nlinguistic sense disambiguation and fine-grained multimodal retrieval. Although\nthere has been encouraging progress in document-based question answering due to\nthe utilization of large language and open-world prior models\cite{1}, several\nchallenges persist, including prolonged response times, extended inference\ndurations, and imprecision in matching. In order to overcome these challenges,\nwe propose Jaeger, a concatenation-based multi-transformer VQA model. To derive\nquestion features, we leverage the exceptional capabilities of RoBERTa\nlarge\cite{2} and GPT2-xl\cite{3} as feature extractors. Subsequently, we\nsubject the outputs from both models to a concatenation process. This operation\nallows the model to consider information from diverse sources concurrently,\nstrengthening its representational capability. By leveraging pre-trained models\nfor feature extraction, our approach has the potential to amplify the\nperformance of these models through concatenation. After concatenation, we\napply dimensionality reduction to the output features, reducing the model's\ncomputational cost and inference time. Empirical results demonstrate\nthat our proposed model achieves competitive performance on Task C of the\nPDF-VQA Dataset.\n","authors":["Jieting Long","Zewei Shi","Penghao Jiang","Yidong Gan"],"pdf_url":"https://arxiv.org/pdf/2310.07091v1.pdf","comment":"This paper is the technical research paper of CIKM 2023 DocIU\n challenges. The authors received the CIKM 2023 DocIU Winner Award, sponsored\n by Google, Microsoft, and the Centre for data-driven geoscience"},{"id":"http://arxiv.org/abs/2310.07088v1","updated":"2023-10-11T00:01:41Z","published":"2023-10-11T00:01:41Z","title":"Diversity of Thought Improves Reasoning Abilities of Large Language\n Models","summary":" Large language models (LLMs) are documented to struggle in settings that\nrequire complex reasoning. Nevertheless, instructing the model to break down\nthe problem into smaller reasoning steps (Wei et al., 2022), or ensembling\nvarious generations through modifying decoding steps (Wang et al., 2023) boosts\nperformance. Current methods assume that the input prompt is fixed and expect\nthe decoding strategies to introduce the diversity needed for ensembling. In\nthis work, we relax this assumption and discuss how one can create and leverage\nvariations of the input prompt as a means to diversity of thought to improve\nmodel performance. We propose a method that automatically improves prompt\ndiversity by soliciting feedback from the LLM to ideate approaches that fit\nthe problem. We then ensemble the diverse prompts in our method DIV-SE (DIVerse\nreasoning path Self-Ensemble) across multiple inference calls. 
We also propose\na cost-effective alternative where diverse prompts are used within a single\ninference call; we call this IDIV-SE (In-call DIVerse reasoning path\nSelf-Ensemble). Under a fixed generation budget, DIV-SE and IDIV-SE outperform\nthe previously discussed baselines using both GPT-3.5 and GPT-4 on several\nreasoning benchmarks, without modifying the decoding process. Additionally,\nDIV-SE advances state-of-the-art performance on recent planning benchmarks\n(Valmeekam et al., 2023), exceeding the highest previously reported accuracy by\nat least 29.6 percentage points on the most challenging 4/5 Blocksworld task.\nOur results shed light on how to enforce prompt diversity toward LLM reasoning\nand thereby improve the pareto frontier of the accuracy-cost trade-off.\n","authors":["Ranjita Naik","Varun Chandrasekaran","Mert Yuksekgonul","Hamid Palangi","Besmira Nushi"],"pdf_url":"https://arxiv.org/pdf/2310.07088v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.10966v5","updated":"2023-10-11T23:59:52Z","published":"2023-09-19T23:39:07Z","title":"MBR and QE Finetuning: Training-time Distillation of the Best and Most\n Expensive Decoding Methods","summary":" Recent research in decoding methods for Natural Language Generation (NLG)\ntasks has shown that MAP decoding is not optimal, because model probabilities\ndo not always align with human preferences. Stronger decoding methods,\nincluding Quality Estimation (QE) reranking and Minimum Bayes' Risk (MBR)\ndecoding, have since been proposed to mitigate the model-perplexity-vs-quality\nmismatch. While these decoding methods achieve state-of-the-art performance,\nthey are prohibitively expensive to compute. In this work, we propose MBR\nfinetuning and QE finetuning which distill the quality gains from these\ndecoding methods at training time, while using an efficient decoding algorithm\nat inference time. Using the canonical NLG task of Neural Machine Translation\n(NMT), we show that even with self-training, these finetuning methods\nsignificantly outperform the base model. Moreover, when using an external LLM\nas a teacher model, these finetuning methods outperform finetuning on\nhuman-generated references. These findings suggest new ways to leverage\nmonolingual data to achieve improvements in model quality that are on par with,\nor even exceed, improvements from human-curated data, while maintaining maximum\nefficiency during decoding.\n","authors":["Mara Finkelstein","Subhajit Naskar","Mehdi Mirzazadeh","Apurva Shah","Markus Freitag"],"pdf_url":"https://arxiv.org/pdf/2309.10966v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.03135v3","updated":"2023-10-11T23:38:36Z","published":"2023-07-06T17:05:26Z","title":"Distilling Large Vision-Language Model with Out-of-Distribution\n Generalizability","summary":" Large vision-language models have achieved outstanding performance, but their\nsize and computational requirements make their deployment on\nresource-constrained devices and time-sensitive tasks impractical. Model\ndistillation, the process of creating smaller, faster models that maintain the\nperformance of larger models, is a promising direction towards the solution.\nThis paper investigates the distillation of visual representations in large\nteacher vision-language models into lightweight student models using a small-\nor mid-scale dataset. Notably, this study focuses on open-vocabulary\nout-of-distribution (OOD) generalization, a challenging problem that has been\noverlooked in previous model distillation literature. 
We propose two principles\nfrom vision and language modality perspectives to enhance student's OOD\ngeneralization: (1) by better imitating teacher's visual representation space,\nand carefully promoting better coherence in vision-language alignment with the\nteacher; (2) by enriching the teacher's language representations with\ninformative and finegrained semantic attributes to effectively distinguish\nbetween different labels. We propose several metrics and conduct extensive\nexperiments to investigate their techniques. The results demonstrate\nsignificant improvements in zero-shot and few-shot student performance on\nopen-vocabulary out-of-distribution classification, highlighting the\neffectiveness of our proposed approaches. Poster:\nhttps://xuanlinli17.github.io/pdfs/iccv23_large_vlm_distillation_poster.pdf\nCode: https://github.com/xuanlinli17/large_vlm_distillation_ood\n","authors":["Xuanlin Li","Yunhao Fang","Minghua Liu","Zhan Ling","Zhuowen Tu","Hao Su"],"pdf_url":"https://arxiv.org/pdf/2307.03135v3.pdf","comment":"Published at International Conference on Computer Vision (ICCV) 2023.\n Poster at\n https://xuanlinli17.github.io/pdfs/iccv23_large_vlm_distillation_poster.pdf"},{"id":"http://arxiv.org/abs/2310.07931v1","updated":"2023-10-11T23:01:29Z","published":"2023-10-11T23:01:29Z","title":"D2 Pruning: Message Passing for Balancing Diversity and Difficulty in\n Data Pruning","summary":" Analytical theories suggest that higher-quality data can lead to lower test\nerrors in models trained on a fixed data budget. Moreover, a model can be\ntrained on a lower compute budget without compromising performance if a dataset\ncan be stripped of its redundancies. Coreset selection (or data pruning) seeks\nto select a subset of the training data so as to maximize the performance of\nmodels trained on this subset, also referred to as coreset. There are two\ndominant approaches: (1) geometry-based data selection for maximizing data\ndiversity in the coreset, and (2) functions that assign difficulty scores to\nsamples based on training dynamics. Optimizing for data diversity leads to a\ncoreset that is biased towards easier samples, whereas, selection by difficulty\nranking omits easy samples that are necessary for the training of deep learning\nmodels. This demonstrates that data diversity and importance scores are two\ncomplementary factors that need to be jointly considered during coreset\nselection. We represent a dataset as an undirected graph and propose a novel\npruning algorithm, D2 Pruning, that uses forward and reverse message passing\nover this dataset graph for coreset selection. D2 Pruning updates the\ndifficulty scores of each example by incorporating the difficulty of its\nneighboring examples in the dataset graph. Then, these updated difficulty\nscores direct a graph-based sampling method to select a coreset that\nencapsulates both diverse and difficult regions of the dataset space. We\nevaluate supervised and self-supervised versions of our method on various\nvision and language datasets. 
Results show that D2 Pruning improves coreset\nselection over previous state-of-the-art methods for up to 70% pruning rates.\nAdditionally, we find that using D2 Pruning for filtering large multimodal\ndatasets leads to increased diversity in the dataset and improved\ngeneralization of pretrained models.\n","authors":["Adyasha Maharana","Prateek Yadav","Mohit Bansal"],"pdf_url":"https://arxiv.org/pdf/2310.07931v1.pdf","comment":"17 pages (Our code is available at\n https://github.com/adymaharana/d2pruning)"},{"id":"http://arxiv.org/abs/2310.07929v1","updated":"2023-10-11T22:57:03Z","published":"2023-10-11T22:57:03Z","title":"Crosslingual Structural Priming and the Pre-Training Dynamics of\n Bilingual Language Models","summary":" Do multilingual language models share abstract grammatical representations\nacross languages, and if so, when do these develop? Following Sinclair et al.\n(2022), we use structural priming to test for abstract grammatical\nrepresentations with causal effects on model outputs. We extend the approach to\na Dutch-English bilingual setting, and we evaluate a Dutch-English language\nmodel during pre-training. We find that crosslingual structural priming effects\nemerge early after exposure to the second language, with less than 1M tokens of\ndata in that language. We discuss implications for data contamination,\nlow-resource transfer, and how abstract grammatical representations emerge in\nmultilingual models.\n","authors":["Catherine Arnett","Tyler A. Chang","James A. Michaelov","Benjamin K. Bergen"],"pdf_url":"https://arxiv.org/pdf/2310.07929v1.pdf","comment":"Extended abstract accepted to the 3rd Multilingual Representation\n Learning workshop at EMNLP 2023"},{"id":"http://arxiv.org/abs/2308.03853v2","updated":"2023-10-11T22:41:37Z","published":"2023-08-07T18:03:10Z","title":"Exploring zero-shot capability of large language models in inferences\n from medical oncology notes","summary":" Both medical care and observational studies in oncology require a thorough\nunderstanding of a patient's disease progression and treatment history, often\nelaborately documented in clinical notes. Despite their vital role, no current\noncology information representation and annotation schema fully encapsulates\nthe diversity of information recorded within these notes. Although large\nlanguage models (LLMs) have recently exhibited impressive performance on\nvarious medical natural language processing tasks, due to the current lack of\ncomprehensively annotated oncology datasets, an extensive evaluation of LLMs in\nextracting and reasoning with the complex rhetoric in oncology notes remains\nunderstudied. We developed a detailed schema for annotating textual oncology\ninformation, encompassing patient characteristics, tumor characteristics,\ntests, treatments, and temporality. Using a corpus of 40 de-identified breast\nand pancreatic cancer progress notes at University of California, San\nFrancisco, we applied this schema to assess the abilities of three\nrecently-released LLMs (GPT-4, GPT-3.5-turbo, and FLAN-UL2) to perform\nzero-shot extraction of detailed oncological history from two narrative\nsections of clinical progress notes. Our team annotated 9028 entities, 9986\nmodifiers, and 5312 relationships. The GPT-4 model exhibited overall best\nperformance, with an average BLEU score of 0.68, an average ROUGE score of\n0.71, and an average accuracy of 67% on complex tasks (expert manual evaluation\non subset). 
Notably, it was proficient in tumor characteristic and medication\nextraction, and demonstrated superior performance in advanced tasks of\ninferring symptoms due to cancer and considerations of future medications.\nGPT-4 may already be usable to extract important facts from cancer progress\nnotes needed for clinical research, complex population management, and\ndocumenting quality patient care.\n","authors":["Madhumita Sushil","Vanessa E. Kennedy","Divneet Mandair","Brenda Y. Miao","Travis Zack","Atul J. Butte"],"pdf_url":"https://arxiv.org/pdf/2308.03853v2.pdf","comment":"Source code available at:\n https://github.com/MadhumitaSushil/OncLLMExtraction"},{"id":"http://arxiv.org/abs/2310.07923v1","updated":"2023-10-11T22:35:18Z","published":"2023-10-11T22:35:18Z","title":"The Expresssive Power of Transformers with Chain of Thought","summary":" Recent theoretical work has identified surprisingly simple reasoning\nproblems, such as checking if two nodes in a graph are connected or simulating\nfinite-state machines, that are provably unsolvable by standard transformers\nthat answer immediately after reading their input. However, in practice,\ntransformers' reasoning can be improved by allowing them to use a \"chain of\nthought\" or \"scratchpad\", i.e., generate and condition on a sequence of\nintermediate tokens before answering. Motivated by this, we ask: Does such\nintermediate generation fundamentally extend the computational power of a\ndecoder-only transformer? We show that the answer is yes, but the amount of\nincrease depends crucially on the amount of intermediate generation. For\ninstance, we find that transformer decoders with a logarithmic number of\ndecoding steps (w.r.t. the input length) push the limits of standard\ntransformers only slightly, while a linear number of decoding steps adds a\nclear new ability (under standard complexity conjectures): recognizing all\nregular languages. Our results also imply that linear steps keep transformer\ndecoders within context-sensitive languages, and polynomial steps make them\nrecognize exactly the class of polynomial-time solvable problems -- the first\nexact characterization of a type of transformers in terms of standard\ncomplexity classes. Together, our results provide a nuanced framework for\nunderstanding how the length of a transformer's chain of thought or scratchpad\nimpacts its reasoning power.\n","authors":["William Merrill","Ashish Sabharwal"],"pdf_url":"https://arxiv.org/pdf/2310.07923v1.pdf","comment":"9-page preprint"},{"id":"http://arxiv.org/abs/2310.07911v1","updated":"2023-10-11T21:38:40Z","published":"2023-10-11T21:38:40Z","title":"Pit One Against Many: Leveraging Attention-head Embeddings for\n Parameter-efficient Multi-head Attention","summary":" Scaling pre-trained language models has resulted in large performance gains\nin various natural language processing tasks but comes with a large cost in\nmemory requirements. Inspired by the position embeddings in transformers, we\naim to simplify and reduce the memory footprint of the multi-head attention\n(MHA) mechanism. We propose an alternative module that uses only a single\nshared projection matrix and multiple head embeddings (MHE), i.e. one per head.\nWe empirically demonstrate that our MHE attention is substantially more memory\nefficient compared to alternative attention mechanisms while achieving high\npredictive performance retention ratio to vanilla MHA on several downstream\ntasks. 
MHE attention only requires a negligible fraction of additional\nparameters ($3nd$, where $n$ is the number of attention heads and $d$ the size\nof the head embeddings) compared to a single-head attention, while MHA requires\n$(3n^2-3n)d^2-3nd$ additional parameters.\n","authors":["Huiyin Xue","Nikolaos Aletras"],"pdf_url":"https://arxiv.org/pdf/2310.07911v1.pdf","comment":"Accepted at EMNLP 2023 Findings"},{"id":"http://arxiv.org/abs/2305.09656v3","updated":"2023-10-11T21:38:22Z","published":"2023-05-16T17:55:51Z","title":"SatLM: Satisfiability-Aided Language Models Using Declarative Prompting","summary":" Prior work has combined chain-of-thought prompting in large language models\n(LLMs) with programmatic representations to perform effective and transparent\nreasoning. While such an approach works well for tasks that only require\nforward reasoning (e.g., straightforward arithmetic), it is less effective for\nconstraint solving problems that require more sophisticated planning and\nsearch. In this paper, we propose a new satisfiability-aided language modeling\n(SatLM) approach for improving the reasoning capabilities of LLMs. We use an\nLLM to generate a declarative task specification rather than an imperative\nprogram and leverage an off-the-shelf automated theorem prover to derive the\nfinal answer. This approach has two key advantages. The declarative\nspecification is closer to the problem description than the reasoning steps\nare, so the LLM can parse it out of the description more accurately.\nFurthermore, by offloading the actual reasoning task to an automated theorem\nprover, our approach can guarantee the correctness of the answer with respect\nto the parsed specification and avoid planning errors in the solving process.\nWe evaluate SATLM on 8 different datasets and show that it consistently\noutperforms program-aided LMs in the imperative paradigm. In particular, SATLM\noutperforms program-aided LMs by 23% on a challenging subset of the GSM\narithmetic reasoning dataset; SATLM also achieves a new SoTA on LSAT and\nBoardgameQA, surpassing previous models that are trained on the respective\ntraining sets.\n","authors":["Xi Ye","Qiaochu Chen","Isil Dillig","Greg Durrett"],"pdf_url":"https://arxiv.org/pdf/2305.09656v3.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2310.07889v1","updated":"2023-10-11T20:52:30Z","published":"2023-10-11T20:52:30Z","title":"LangNav: Language as a Perceptual Representation for Navigation","summary":" We explore the use of language as a perceptual representation for\nvision-and-language navigation. Our approach uses off-the-shelf vision systems\n(for image captioning and object detection) to convert an agent's egocentric\npanoramic view at each time step into natural language descriptions. We then\nfinetune a pretrained language model to select an action, based on the current\nview and the trajectory history, that would best fulfill the navigation\ninstructions. In contrast to the standard setup which adapts a pretrained\nlanguage model to work directly with continuous visual features from pretrained\nvision models, our approach instead uses (discrete) language as the perceptual\nrepresentation. 
We explore two use cases of our language-based navigation\n(LangNav) approach on the R2R vision-and-language navigation benchmark:\ngenerating synthetic trajectories from a prompted large language model (GPT-4)\nwith which to finetune a smaller language model; and sim-to-real transfer where\nwe transfer a policy learned on a simulated environment (ALFRED) to a\nreal-world environment (R2R). Our approach is found to improve upon strong\nbaselines that rely on visual features in settings where only a few gold\ntrajectories (10-100) are available, demonstrating the potential of using\nlanguage as a perceptual representation for navigation tasks.\n","authors":["Bowen Pan","Rameswar Panda","SouYoung Jin","Rogerio Feris","Aude Oliva","Phillip Isola","Yoon Kim"],"pdf_url":"https://arxiv.org/pdf/2310.07889v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07875v1","updated":"2023-10-11T20:34:42Z","published":"2023-10-11T20:34:42Z","title":"TabLib: A Dataset of 627M Tables with Context","summary":" It is well-established that large, diverse datasets play a pivotal role in\nthe performance of modern AI systems for text and image modalities. However,\nthere are no datasets for tabular data of comparable size and diversity to\nthose available for text and images. Thus we present \"TabLib'', a compilation\nof 627 million tables totaling 69 TiB, along with 867B tokens of context.\nTabLib was extracted from numerous file formats, including CSV, HTML, SQLite,\nPDF, Excel, and others, sourced from GitHub and Common Crawl. The size and\ndiversity of TabLib offer considerable promise in the table modality,\nreminiscent of the original promise of foundational datasets for text and\nimages, such as The Pile and LAION.\n","authors":["Gus Eggert","Kevin Huo","Mike Biven","Justin Waugh"],"pdf_url":"https://arxiv.org/pdf/2310.07875v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07856v1","updated":"2023-10-11T19:58:07Z","published":"2023-10-11T19:58:07Z","title":"Assessing Evaluation Metrics for Neural Test Oracle Generation","summary":" In this work, we revisit existing oracle generation studies plus ChatGPT to\nempirically investigate the current standing of their performance in both\nNLG-based and test adequacy metrics. Specifically, we train and run four\nstate-of-the-art test oracle generation models on five NLG-based and two test\nadequacy metrics for our analysis. We apply two different correlation analyses\nbetween these two different sets of metrics. Surprisingly, we found no\nsignificant correlation between the NLG-based metrics and test adequacy\nmetrics. For instance, oracles generated from ChatGPT on the project\nactivemq-artemis had the highest performance on all the NLG-based metrics among\nthe studied NOGs, however, it had the most number of projects with a decrease\nin test adequacy metrics compared to all the studied NOGs. We further conduct a\nqualitative analysis to explore the reasons behind our observations, we found\nthat oracles with high NLG-based metrics but low test adequacy metrics tend to\nhave complex or multiple chained method invocations within the oracle's\nparameters, making it hard for the model to generate completely, affecting the\ntest adequacy metrics. 
On the other hand, oracles with low NLG-based metrics\nbut high test adequacy metrics tend to call different assertion types\nor a different method that functions similarly to the ones in the ground truth.\nOverall, this work complements prior studies on test oracle generation with an\nextensive performance evaluation with both NLG and test adequacy metrics and\nprovides guidelines for better assessment of deep learning applications in\nsoftware test generation in the future.\n","authors":["Jiho Shin","Hadi Hemmati","Moshi Wei","Song Wang"],"pdf_url":"https://arxiv.org/pdf/2310.07856v1.pdf","comment":"10 pages + reference"},{"id":"http://arxiv.org/abs/2305.09548v2","updated":"2023-10-11T19:57:43Z","published":"2023-05-16T15:45:59Z","title":"Measuring Social Dimensions of Self-Presentation in Social Media\n Biographies with an Identity-based Approach","summary":" Social media users on sites like Twitter, Instagram, and Tiktok use the\nprofile description, or bio, field of user profiles to present themselves to\nthe world. In contrast to the ``offline'' world, where social context often\nencourages us to adopt a single identity, the profile description is a\nfree-text field in which users are encouraged to present the self using\nmultiple, sometimes conflicting, social identities. While sociologists, social\npsychologists, sociolinguists, and increasingly computational social\nscientists, have developed a large and growing array of methods to estimate the\nmeaning of individual social identities, little work has attended to the ways\nin which social meanings emerge from the collections of social identities\npresent in social media bios. The present work proposes and evaluates three\nnovel, identity-based methods to measure the social dimensions of meaning\nexpressed in Twitter bios. We show that these models outperform reasonable\nbaselines with respect to 1) predicting which sets of identities are more\nlikely to co-occur within a single biography and 2) quantifying perceptions of\nentire social media biographies along salient dimensions of social meaning on\nTwitter, in particular partisanship. We demonstrate the utility of our method\nin a computational social science setting by using model outputs to better\nunderstand how self-presentation along dimensions of partisanship, religion,\nage, and gender is related to the sharing of URLs on Twitter from low versus\nhigh quality news sites.\n","authors":["Navid Madani","Rabiraj Bandyopadhyay","Briony Swire-Thompson","Michael Miller Yoder","Kenneth Joseph"],"pdf_url":"https://arxiv.org/pdf/2305.09548v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07849v1","updated":"2023-10-11T19:51:13Z","published":"2023-10-11T19:51:13Z","title":"Synthetic Data Generation with Large Language Models for Text\n Classification: Potential and Limitations","summary":" The collection and curation of high-quality training data is crucial for\ndeveloping text classification models with superior performance, but it is\noften associated with significant costs and time investment. Researchers have\nrecently explored using large language models (LLMs) to generate synthetic\ndatasets as an alternative approach. However, the effectiveness of the\nLLM-generated synthetic data in supporting model training is inconsistent\nacross different classification tasks. 
To better understand factors that\nmoderate the effectiveness of the LLM-generated synthetic data, in this study,\nwe look into how the performance of models trained on these synthetic data may\nvary with the subjectivity of classification. Our results indicate that\nsubjectivity, at both the task level and instance level, is negatively\nassociated with the performance of the model trained on synthetic data. We\nconclude by discussing the implications of our work on the potential and\nlimitations of leveraging LLM for synthetic data generation.\n","authors":["Zhuoyan Li","Hangxiao Zhu","Zhuoran Lu","Ming Yin"],"pdf_url":"https://arxiv.org/pdf/2310.07849v1.pdf","comment":"Accepted to EMNLP 2023"},{"id":"http://arxiv.org/abs/2310.07848v1","updated":"2023-10-11T19:50:59Z","published":"2023-10-11T19:50:59Z","title":"Framework for Question-Answering in Sanskrit through Automated\n Construction of Knowledge Graphs","summary":" Sanskrit (sa\\d{m}sk\\d{r}ta) enjoys one of the largest and most varied\nliterature in the whole world. Extracting the knowledge from it, however, is a\nchallenging task due to multiple reasons including complexity of the language\nand paucity of standard natural language processing tools. In this paper, we\ntarget the problem of building knowledge graphs for particular types of\nrelationships from sa\\d{m}sk\\d{r}ta texts. We build a natural language\nquestion-answering system in sa\\d{m}sk\\d{r}ta that uses the knowledge graph to\nanswer factoid questions. We design a framework for the overall system and\nimplement two separate instances of the system on human relationships from\nmah\\=abh\\=arata and r\\=am\\=aya\\d{n}a, and one instance on synonymous\nrelationships from bh\\=avaprak\\=a\\'sa nigha\\d{n}\\d{t}u, a technical text from\n\\=ayurveda. We show that about 50% of the factoid questions can be answered\ncorrectly by the system. More importantly, we analyse the shortcomings of the\nsystem in detail for each step, and discuss the possible ways forward.\n","authors":["Hrishikesh Terdalkar","Arnab Bhattacharya"],"pdf_url":"https://arxiv.org/pdf/2310.07848v1.pdf","comment":"Accepted at 6th International Sanskrit Computational Linguistics\n Symposium (ISCLS) 2019"},{"id":"http://arxiv.org/abs/2310.07830v1","updated":"2023-10-11T19:16:09Z","published":"2023-10-11T19:16:09Z","title":"Does Synthetic Data Make Large Language Models More Efficient?","summary":" Natural Language Processing (NLP) has undergone transformative changes with\nthe advent of deep learning methodologies. One challenge persistently\nconfronting researchers is the scarcity of high-quality, annotated datasets\nthat drive these models. This paper explores the nuances of synthetic data\ngeneration in NLP, with a focal point on template-based question generation. By\nassessing its advantages, including data augmentation potential and the\nintroduction of structured variety, we juxtapose these benefits against\ninherent limitations, such as the risk of overfitting and the constraints posed\nby pre-defined templates. Drawing from empirical evaluations, we demonstrate\nthe impact of template-based synthetic data on the performance of modern\ntransformer models. We conclude by emphasizing the delicate balance required\nbetween synthetic and real-world data, and the future trajectories of\nintegrating synthetic data in model training pipelines. 
The findings aim to\nguide NLP practitioners in harnessing synthetic data's potential, ensuring\noptimal model performance in diverse applications.\n","authors":["Sia Gholami","Marwan Omar"],"pdf_url":"https://arxiv.org/pdf/2310.07830v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07826v1","updated":"2023-10-11T19:09:07Z","published":"2023-10-11T19:09:07Z","title":"Antarlekhaka: A Comprehensive Tool for Multi-task Natural Language\n Annotation","summary":" One of the primary obstacles in the advancement of Natural Language\nProcessing (NLP) technologies for low-resource languages is the lack of\nannotated datasets for training and testing machine learning models. In this\npaper, we present Antarlekhaka, a tool for manual annotation of a comprehensive\nset of tasks relevant to NLP. The tool is Unicode-compatible,\nlanguage-agnostic, Web-deployable and supports distributed annotation by\nmultiple simultaneous annotators. The system sports user-friendly interfaces\nfor 8 categories of annotation tasks. These, in turn, enable the annotation of\na considerably larger set of NLP tasks. The task categories include two\nlinguistic tasks not handled by any other tool, namely, sentence boundary\ndetection and deciding canonical word order, which are important tasks for text\nthat is in the form of poetry. We propose the idea of sequential annotation\nbased on small text units, where an annotator performs several tasks related to\na single text unit before proceeding to the next unit. The research\napplications of the proposed mode of multi-task annotation are also discussed.\nAntarlekhaka outperforms other annotation tools in objective evaluation. It has\nbeen also used for two real-life annotation tasks on two different languages,\nnamely, Sanskrit and Bengali. The tool is available at\nhttps://github.com/Antarlekhaka/code.\n","authors":["Hrishikesh Terdalkar","Arnab Bhattacharya"],"pdf_url":"https://arxiv.org/pdf/2310.07826v1.pdf","comment":"Accepted: 3rd Workshop for Natural Language Processing Open Source\n Software (NLP-OSS) @ EMNLP 2023"},{"id":"http://arxiv.org/abs/2310.07821v1","updated":"2023-10-11T19:02:57Z","published":"2023-10-11T19:02:57Z","title":"Non-autoregressive Text Editing with Copy-aware Latent Alignments","summary":" Recent work has witnessed a paradigm shift from Seq2Seq to Seq2Edit in the\nfield of text editing, with the aim of addressing the slow autoregressive\ninference problem posed by the former. Despite promising results, Seq2Edit\napproaches still face several challenges such as inflexibility in generation\nand difficulty in generalizing to other languages. In this work, we propose a\nnovel non-autoregressive text editing method to circumvent the above issues, by\nmodeling the edit process with latent CTC alignments. We make a crucial\nextension to CTC by introducing the copy operation into the edit space, thus\nenabling more efficient management of textual overlap in editing. We conduct\nextensive experiments on GEC and sentence fusion tasks, showing that our\nproposed method significantly outperforms existing Seq2Edit models and achieves\nsimilar or even better results than Seq2Seq with over $4\\times$ speedup.\nMoreover, it demonstrates good generalizability on German and Russian. 
In-depth\nanalyses reveal the strengths of our method in terms of the robustness under\nvarious scenarios and generating fluent and flexible outputs.\n","authors":["Yu Zhang","Yue Zhang","Leyang Cui","Guohong Fu"],"pdf_url":"https://arxiv.org/pdf/2310.07821v1.pdf","comment":"EMNLP 2023"},{"id":"http://arxiv.org/abs/2310.07819v1","updated":"2023-10-11T19:00:40Z","published":"2023-10-11T19:00:40Z","title":"Faithfulness Measurable Masked Language Models","summary":" A common approach to explain NLP models, is to use importance measures that\nexpress which tokens are important for a prediction. Unfortunately, such\nexplanations are often wrong despite being persuasive. Therefore, it is\nessential to measure their faithfulness. One such metric is if tokens are truly\nimportant, then masking them should result in worse model performance. However,\ntoken masking introduces out-of-distribution issues and existing solutions are\ncomputationally expensive and employ proxy-models. Furthermore, other metrics\nare very limited in scope. In this work, we propose an inherently faithfulness\nmeasurable model that addresses these challenges. This is achieved by using a\nnovel fine-tuning method that incorporates masking, such that masking tokens\nbecome in-distribution by design. This differs from existing approaches, which\nare completely model-agnostic but are inapplicable in practice. We demonstrate\nthe generality of our approach by applying it to various tasks and validate it\nusing statistical in-distribution tests. Additionally, because masking is\nin-distribution, importance measures which themselves use masking become more\nfaithful, thus our model becomes more explainable.\n","authors":["Andreas Madsen","Siva Reddy","Sarath Chandar"],"pdf_url":"https://arxiv.org/pdf/2310.07819v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07818v1","updated":"2023-10-11T18:59:48Z","published":"2023-10-11T18:59:48Z","title":"Exploring the Relationship between Analogy Identification and Sentence\n Structure Encoding in Large Language Models","summary":" Identifying analogies plays a pivotal role in human cognition and language\nproficiency. In the last decade, there has been extensive research on word\nanalogies in the form of ``A is to B as C is to D.'' However, there is a\ngrowing interest in analogies that involve longer text, such as sentences and\ncollections of sentences, which convey analogous meanings. While the current\nNLP research community evaluates the ability of Large Language Models (LLMs) to\nidentify such analogies, the underlying reasons behind these abilities warrant\ndeeper investigation. Furthermore, the capability of LLMs to encode both\nsyntactic and semantic structures of language within their embeddings has\ngarnered significant attention with the surge in their utilization. In this\nwork, we examine the relationship between the abilities of multiple LLMs to\nidentify sentence analogies, and their capacity to encode syntactic and\nsemantic structures. Through our analysis, we find that analogy identification\nability of LLMs is positively correlated with their ability to encode syntactic\nand semantic structures of sentences. 
Specifically, we find that the LLMs which\ncapture syntactic structures better, also have higher abilities in identifying\nsentence analogies.\n","authors":["Thilini Wijesiriwardene","Ruwan Wickramarachchi","Aishwarya Naresh Reganti","Vinija Jain","Aman Chadha","Amit Sheth","Amitava Das"],"pdf_url":"https://arxiv.org/pdf/2310.07818v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07815v1","updated":"2023-10-11T18:56:15Z","published":"2023-10-11T18:56:15Z","title":"Language Models As Semantic Indexers","summary":" Semantic identifier (ID) is an important concept in information retrieval\nthat aims to preserve the semantics of objects such as documents and items\ninside their IDs. Previous studies typically adopt a two-stage pipeline to\nlearn semantic IDs by first procuring embeddings using off-the-shelf text\nencoders and then deriving IDs based on the embeddings. However, each step\nintroduces potential information loss and there is usually an inherent mismatch\nbetween the distribution of embeddings within the latent space produced by text\nencoders and the anticipated distribution required for semantic indexing.\nNevertheless, it is non-trivial to design a method that can learn the\ndocument's semantic representations and its hierarchical structure\nsimultaneously, given that semantic IDs are discrete and sequentially\nstructured, and the semantic supervision is deficient. In this paper, we\nintroduce LMINDEXER, a self-supervised framework to learn semantic IDs with a\ngenerative language model. We tackle the challenge of sequential discrete ID by\nintroducing a semantic indexer capable of generating neural sequential discrete\nrepresentations with progressive training and contrastive learning. In response\nto the semantic supervision deficiency, we propose to train the model with a\nself-supervised document reconstruction objective. The learned semantic indexer\ncan facilitate various downstream tasks, such as recommendation and retrieval.\nWe conduct experiments on three tasks including recommendation, product search,\nand document retrieval on five datasets from various domains, where LMINDEXER\noutperforms competitive baselines significantly and consistently.\n","authors":["Bowen Jin","Hansi Zeng","Guoyin Wang","Xiusi Chen","Tianxin Wei","Ruirui Li","Zhengyang Wang","Zheng Li","Yang Li","Hanqing Lu","Suhang Wang","Jiawei Han","Xianfeng Tang"],"pdf_url":"https://arxiv.org/pdf/2310.07815v1.pdf","comment":"9 pages, 3 appendix pages"},{"id":"http://arxiv.org/abs/2310.07803v1","updated":"2023-10-11T18:36:13Z","published":"2023-10-11T18:36:13Z","title":"A general mechanism of humor: reformulating the semantic overlap","summary":" This article proposes a cognitive mechanism of humour of general\napplicability, not restricted to verbal communication. It is indebted to\nRaskin's concept of script overlap, and conforms to the incongruity-resolution\ntheoretical framework, but it is built on the notion of constraint, an abstract\ncorrespondence between sets of data. Under this view, script overlap is an\noutcome of a more abstractly described phenomenon, constraint overlap. The\nimportant concept of the overlooked argument is introduced to characterise the\ntwo overlapping constraints -- overt and covert. Their inputs and outputs are\nnot directly encoded in utterances, but implicated by them, and their overlap\nresults in another overlap at the level of the communicated utterances, that\nthe incongruity reveals. 
Our hypothesis assumes as a given that the evocation\nof such constraints is a cognitive effect of the inferential process by which a\nhearer interprets utterances. We base this assumption on Hofstadter's theory of\nanalogy-making as the essence of human thought. By substituting \"stimuli\" of\nany kind for \"utterances\" in this model, we obtain a mechanism as easily\napplicable to non-verbal communication -- slapstick, cartoons -- and we propose\nit describes the necessary and sufficient conditions for a communicative act in\nany modality to carry humour.\n","authors":["Javier Martínez"],"pdf_url":"https://arxiv.org/pdf/2310.07803v1.pdf","comment":"24 pages, 8 figures"},{"id":"http://arxiv.org/abs/2310.07795v1","updated":"2023-10-11T18:30:37Z","published":"2023-10-11T18:30:37Z","title":"Ontology Enrichment for Effective Fine-grained Entity Typing","summary":" Fine-grained entity typing (FET) is the task of identifying specific entity\ntypes at a fine-grained level for entity mentions based on their contextual\ninformation. Conventional methods for FET require extensive human annotation,\nwhich is time-consuming and costly. Recent studies have been developing weakly\nsupervised or zero-shot approaches. We study the setting of zero-shot FET where\nonly an ontology is provided. However, most existing ontology structures lack\nrich supporting information and even contain ambiguous relations, making them\nineffective in guiding FET. Recently developed language models, though\npromising in various few-shot and zero-shot NLP tasks, may face challenges in\nzero-shot FET due to their lack of interaction with task-specific ontology. In\nthis study, we propose OnEFET, where we (1) enrich each node in the ontology\nstructure with two types of extra information: instance information for\ntraining sample augmentation and topic information to relate types to contexts,\nand (2) develop a coarse-to-fine typing algorithm that exploits the enriched\ninformation by training an entailment model with contrasting topics and\ninstance-based augmented training samples. Our experiments show that OnEFET\nachieves high-quality fine-grained entity typing without human annotation,\noutperforming existing zero-shot methods by a large margin and rivaling\nsupervised methods.\n","authors":["Siru Ouyang","Jiaxin Huang","Pranav Pillai","Yunyi Zhang","Yu Zhang","Jiawei Han"],"pdf_url":"https://arxiv.org/pdf/2310.07795v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07793v1","updated":"2023-10-11T18:27:12Z","published":"2023-10-11T18:27:12Z","title":"GenTKG: Generative Forecasting on Temporal Knowledge Graph","summary":" The rapid advancements in large language models (LLMs) have ignited interest\nin the temporal knowledge graph (tKG) domain, where conventional carefully\ndesigned embedding-based and rule-based models dominate. The question remains\nopen of whether pre-trained LLMs can understand structured temporal relational\ndata and replace them as the foundation model for temporal relational\nforecasting. Therefore, we bring temporal knowledge forecasting into the\ngenerative setting. However, challenges occur in the huge chasms between\ncomplex temporal graph data structure and sequential natural expressions LLMs\ncan handle, and between the enormous data sizes of tKGs and heavy computation\ncosts of finetuning LLMs. 
To address these challenges, we propose a novel\nretrieval augmented generation framework that performs generative forecasting\non tKGs named GenTKG, which combines a temporal logical rule-based retrieval\nstrategy and lightweight parameter-efficient instruction tuning. Extensive\nexperiments have shown that GenTKG outperforms conventional methods of temporal\nrelational forecasting under low computation resources. GenTKG also highlights\nremarkable transferability with exceeding performance on unseen datasets\nwithout re-training. Our work reveals the huge potential of LLMs in the tKG\ndomain and opens a new frontier for generative forecasting on tKGs.\n","authors":["Ruotong Liao","Xu Jia","Yunpu Ma","Volker Tresp"],"pdf_url":"https://arxiv.org/pdf/2310.07793v1.pdf","comment":"8 pages"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2310.07716v1","updated":"2023-10-11T17:59:56Z","published":"2023-10-11T17:59:56Z","title":"PAD: A Dataset and Benchmark for Pose-agnostic Anomaly Detection","summary":" Object anomaly detection is an important problem in the field of machine\nvision and has seen remarkable progress recently. However, two significant\nchallenges hinder its research and application. First, existing datasets lack\ncomprehensive visual information from various pose angles. They usually have an\nunrealistic assumption that the anomaly-free training dataset is pose-aligned,\nand the testing samples have the same pose as the training data. However, in\npractice, anomaly may exist in any regions on a object, the training and query\nsamples may have different poses, calling for the study on pose-agnostic\nanomaly detection. Second, the absence of a consensus on experimental protocols\nfor pose-agnostic anomaly detection leads to unfair comparisons of different\nmethods, hindering the research on pose-agnostic anomaly detection. To address\nthese issues, we develop Multi-pose Anomaly Detection (MAD) dataset and\nPose-agnostic Anomaly Detection (PAD) benchmark, which takes the first step to\naddress the pose-agnostic anomaly detection problem. Specifically, we build MAD\nusing 20 complex-shaped LEGO toys including 4K views with various poses, and\nhigh-quality and diverse 3D anomalies in both simulated and real environments.\nAdditionally, we propose a novel method OmniposeAD, trained using MAD,\nspecifically designed for pose-agnostic anomaly detection. Through\ncomprehensive evaluations, we demonstrate the relevance of our dataset and\nmethod. Furthermore, we provide an open-source benchmark library, including\ndataset and baseline methods that cover 8 anomaly detection paradigms, to\nfacilitate future research and application in this domain. Code, data, and\nmodels are publicly available at https://github.com/EricLee0224/PAD.\n","authors":["Qiang Zhou","Weize Li","Lihan Jiang","Guoliang Wang","Guyue Zhou","Shanghang Zhang","Hao Zhao"],"pdf_url":"https://arxiv.org/pdf/2310.07716v1.pdf","comment":"Accepted by NeurIPS 2023. Codes are available at\n https://github.com/EricLee0224/PAD"},{"id":"http://arxiv.org/abs/2305.05658v2","updated":"2023-10-11T17:59:44Z","published":"2023-05-09T17:52:59Z","title":"TidyBot: Personalized Robot Assistance with Large Language Models","summary":" For a robot to personalize physical assistance effectively, it must learn\nuser preferences that can be generally reapplied to future scenarios. 
In this\nwork, we investigate personalization of household cleanup with robots that can\ntidy up rooms by picking up objects and putting them away. A key challenge is\ndetermining the proper place to put each object, as people's preferences can\nvary greatly depending on personal taste or cultural background. For instance,\none person may prefer storing shirts in the drawer, while another may prefer\nthem on the shelf. We aim to build systems that can learn such preferences from\njust a handful of examples via prior interactions with a particular person. We\nshow that robots can combine language-based planning and perception with the\nfew-shot summarization capabilities of large language models (LLMs) to infer\ngeneralized user preferences that are broadly applicable to future\ninteractions. This approach enables fast adaptation and achieves 91.2% accuracy\non unseen objects in our benchmark dataset. We also demonstrate our approach on\na real-world mobile manipulator called TidyBot, which successfully puts away\n85.0% of objects in real-world test scenarios.\n","authors":["Jimmy Wu","Rika Antonova","Adam Kan","Marion Lepert","Andy Zeng","Shuran Song","Jeannette Bohg","Szymon Rusinkiewicz","Thomas Funkhouser"],"pdf_url":"https://arxiv.org/pdf/2305.05658v2.pdf","comment":"Accepted to Autonomous Robots (AuRo) - Special Issue: Large Language\n Models in Robotics, 2023 and IEEE/RSJ International Conference on Intelligent\n Robots and Systems (IROS), 2023. Project page:\n https://tidybot.cs.princeton.edu"},{"id":"http://arxiv.org/abs/2310.07707v1","updated":"2023-10-11T17:57:14Z","published":"2023-10-11T17:57:14Z","title":"MatFormer: Nested Transformer for Elastic Inference","summary":" Transformer models are deployed in a wide range of settings, from\nmulti-accelerator clusters to standalone mobile phones. The diverse inference\nconstraints in these scenarios necessitate practitioners to train foundation\nmodels such as PaLM 2, Llama, & ViTs as a series of models of varying sizes.\nDue to significant training costs, only a select few model sizes are trained\nand supported, limiting more fine-grained control over relevant tradeoffs,\nincluding latency, cost, and accuracy. This work introduces MatFormer, a nested\nTransformer architecture designed to offer elasticity in a variety of\ndeployment constraints. Each Feed Forward Network (FFN) block of a MatFormer\nmodel is jointly optimized with a few nested smaller FFN blocks. This training\nprocedure allows for the Mix'n'Match of model granularities across layers --\ni.e., a trained universal MatFormer model enables extraction of hundreds of\naccurate smaller models, which were never explicitly optimized. We empirically\ndemonstrate MatFormer's effectiveness across different model classes (decoders\n& encoders), modalities (language & vision), and scales (up to 2.6B\nparameters). We find that a 2.6B decoder-only MatFormer language model (MatLM)\nallows us to extract smaller models spanning from 1.5B to 2.6B, each exhibiting\ncomparable validation loss and one-shot downstream evaluations to their\nindependently trained counterparts. 
Furthermore, we observe that smaller\nencoders extracted from a universal MatFormer-based ViT (MatViT) encoder\npreserve the metric-space structure for adaptive large-scale retrieval.\nFinally, we showcase that speculative decoding with the accurate and consistent\nsubmodels extracted from MatFormer can further reduce inference latency.\n","authors":[" Devvrit","Sneha Kudugunta","Aditya Kusupati","Tim Dettmers","Kaifeng Chen","Inderjit Dhillon","Yulia Tsvetkov","Hannaneh Hajishirzi","Sham Kakade","Ali Farhadi","Prateek Jain"],"pdf_url":"https://arxiv.org/pdf/2310.07707v1.pdf","comment":"31 pages, 12 figures, first three authors contributed equally"},{"id":"http://arxiv.org/abs/2310.07704v1","updated":"2023-10-11T17:55:15Z","published":"2023-10-11T17:55:15Z","title":"Ferret: Refer and Ground Anything Anywhere at Any Granularity","summary":" We introduce Ferret, a new Multimodal Large Language Model (MLLM) capable of\nunderstanding spatial referring of any shape or granularity within an image and\naccurately grounding open-vocabulary descriptions. To unify referring and\ngrounding in the LLM paradigm, Ferret employs a novel and powerful hybrid\nregion representation that integrates discrete coordinates and continuous\nfeatures jointly to represent a region in the image. To extract the continuous\nfeatures of versatile regions, we propose a spatial-aware visual sampler, adept\nat handling varying sparsity across different shapes. Consequently, Ferret can\naccept diverse region inputs, such as points, bounding boxes, and free-form\nshapes. To bolster the desired capability of Ferret, we curate GRIT, a\ncomprehensive refer-and-ground instruction tuning dataset including 1.1M\nsamples that contain rich hierarchical spatial knowledge, with 95K hard\nnegative data to promote model robustness. The resulting model not only\nachieves superior performance in classical referring and grounding tasks, but\nalso greatly outperforms existing MLLMs in region-based and\nlocalization-demanded multimodal chatting. Our evaluations also reveal a\nsignificantly improved capability of describing image details and a remarkable\nalleviation in object hallucination. Code and data will be available at\nhttps://github.com/apple/ml-ferret\n","authors":["Haoxuan You","Haotian Zhang","Zhe Gan","Xianzhi Du","Bowen Zhang","Zirui Wang","Liangliang Cao","Shih-Fu Chang","Yinfei Yang"],"pdf_url":"https://arxiv.org/pdf/2310.07704v1.pdf","comment":"30 pages, 10 figures. Code/Project Website:\n https://github.com/apple/ml-ferret"},{"id":"http://arxiv.org/abs/2310.07702v1","updated":"2023-10-11T17:52:39Z","published":"2023-10-11T17:52:39Z","title":"ScaleCrafter: Tuning-free Higher-Resolution Visual Generation with\n Diffusion Models","summary":" In this work, we investigate the capability of generating images from\npre-trained diffusion models at much higher resolutions than the training image\nsizes. In addition, the generated images should have arbitrary image aspect\nratios. When generating images directly at a higher resolution, 1024 x 1024,\nwith the pre-trained Stable Diffusion using training images of resolution 512 x\n512, we observe persistent problems of object repetition and unreasonable\nobject structures. Existing works for higher-resolution generation, such as\nattention-based and joint-diffusion approaches, cannot well address these\nissues. 
As a new perspective, we examine the structural components of the U-Net\nin diffusion models and identify the crucial cause as the limited perception\nfield of convolutional kernels. Based on this key observation, we propose a\nsimple yet effective re-dilation that can dynamically adjust the convolutional\nperception field during inference. We further propose the dispersed convolution\nand noise-damped classifier-free guidance, which can enable\nultra-high-resolution image generation (e.g., 4096 x 4096). Notably, our\napproach does not require any training or optimization. Extensive experiments\ndemonstrate that our approach can address the repetition issue well and achieve\nstate-of-the-art performance on higher-resolution image synthesis, especially\nin texture details. Our work also suggests that a pre-trained diffusion model\ntrained on low-resolution images can be directly used for high-resolution\nvisual generation without further tuning, which may provide insights for future\nresearch on ultra-high-resolution image and video synthesis.\n","authors":["Yingqing He","Shaoshu Yang","Haoxin Chen","Xiaodong Cun","Menghan Xia","Yong Zhang","Xintao Wang","Ran He","Qifeng Chen","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2310.07702v1.pdf","comment":"Project page: https://yingqinghe.github.io/scalecrafter/ Github:\n https://github.com/YingqingHe/ScaleCrafter"},{"id":"http://arxiv.org/abs/2310.07699v1","updated":"2023-10-11T17:49:13Z","published":"2023-10-11T17:49:13Z","title":"From Scarcity to Efficiency: Improving CLIP Training via Visual-enriched\n Captions","summary":" Web-crawled datasets are pivotal to the success of pre-training\nvision-language models, exemplified by CLIP. However, web-crawled AltTexts can\nbe noisy and potentially irrelevant to images, thereby undermining the crucial\nimage-text alignment. Existing methods for rewriting captions using large\nlanguage models (LLMs) have shown promise on small, curated datasets like CC3M\nand CC12M. Nevertheless, their efficacy on massive web-captured captions is\nconstrained by the inherent noise and randomness in such data. In this study,\nwe address this limitation by focusing on two key aspects: data quality and\ndata variety. Unlike recent LLM rewriting techniques, we emphasize exploiting\nvisual concepts and their integration into the captions to improve data\nquality. For data variety, we propose a novel mixed training scheme that\noptimally leverages AltTexts alongside newly generated Visual-enriched Captions\n(VeC). We use CLIP as one example and adapt the method for CLIP training on\nlarge-scale web-crawled datasets, named VeCLIP. We conduct a comprehensive\nevaluation of VeCLIP across small, medium, and large scales of raw data. Our\nresults show significant advantages in image-text alignment and overall model\nperformance, underscoring the effectiveness of VeCLIP in improving CLIP\ntraining. For example, VeCLIP achieves a remarkable over 20% improvement in\nCOCO and Flickr30k retrieval tasks under the 12M setting. 
For data efficiency,\nwe also achieve a notable over 3% improvement while using only 14% of the data\nemployed in the vanilla CLIP and 11% in ALIGN.\n","authors":["Zhengfeng Lai","Haotian Zhang","Wentao Wu","Haoping Bai","Aleksei Timofeev","Xianzhi Du","Zhe Gan","Jiulong Shan","Chen-Nee Chuah","Yinfei Yang","Meng Cao"],"pdf_url":"https://arxiv.org/pdf/2310.07699v1.pdf","comment":"CV/ML"},{"id":"http://arxiv.org/abs/2310.07697v1","updated":"2023-10-11T17:46:28Z","published":"2023-10-11T17:46:28Z","title":"ConditionVideo: Training-Free Condition-Guided Text-to-Video Generation","summary":" Recent works have successfully extended large-scale text-to-image models to\nthe video domain, producing promising results but at a high computational cost\nand requiring a large amount of video data. In this work, we introduce\nConditionVideo, a training-free approach to text-to-video generation based on\nthe provided condition, video, and input text, by leveraging the power of\noff-the-shelf text-to-image generation methods (e.g., Stable Diffusion).\nConditionVideo generates realistic dynamic videos from random noise or given\nscene videos. Our method explicitly disentangles the motion representation into\ncondition-guided and scenery motion components. To this end, the ConditionVideo\nmodel is designed with a UNet branch and a control branch. To improve temporal\ncoherence, we introduce sparse bi-directional spatial-temporal attention\n(sBiST-Attn). The 3D control network extends the conventional 2D controlnet\nmodel, aiming to strengthen conditional generation accuracy by additionally\nleveraging the bi-directional frames in the temporal domain. Our method\nexhibits superior performance in terms of frame consistency, clip score, and\nconditional accuracy, outperforming other compared methods.\n","authors":["Bo Peng","Xinyuan Chen","Yaohui Wang","Chaochao Lu","Yu Qiao"],"pdf_url":"https://arxiv.org/pdf/2310.07697v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.09535v3","updated":"2023-10-11T17:46:21Z","published":"2023-03-16T17:51:13Z","title":"FateZero: Fusing Attentions for Zero-shot Text-based Video Editing","summary":" The diffusion-based generative models have achieved remarkable success in\ntext-based image generation. However, since it contains enormous randomness in\ngeneration progress, it is still challenging to apply such models for\nreal-world visual content editing, especially in videos. In this paper, we\npropose FateZero, a zero-shot text-based editing method on real-world videos\nwithout per-prompt training or use-specific mask. To edit videos consistently,\nwe propose several techniques based on the pre-trained models. Firstly, in\ncontrast to the straightforward DDIM inversion technique, our approach captures\nintermediate attention maps during inversion, which effectively retain both\nstructural and motion information. These maps are directly fused in the editing\nprocess rather than generated during denoising. To further minimize semantic\nleakage of the source video, we then fuse self-attentions with a blending mask\nobtained by cross-attention features from the source prompt. Furthermore, we\nhave implemented a reform of the self-attention mechanism in denoising UNet by\nintroducing spatial-temporal attention to ensure frame consistency. Yet\nsuccinct, our method is the first one to show the ability of zero-shot\ntext-driven video style and local attribute editing from the trained\ntext-to-image model. 
We also have a better zero-shot shape-aware editing\nability based on the text-to-video model. Extensive experiments demonstrate our\nsuperior temporal consistency and editing capability than previous works.\n","authors":["Chenyang Qi","Xiaodong Cun","Yong Zhang","Chenyang Lei","Xintao Wang","Ying Shan","Qifeng Chen"],"pdf_url":"https://arxiv.org/pdf/2303.09535v3.pdf","comment":"Accepted to ICCV 2023 as an Oral Presentation. Project page:\n https://fate-zero-edit.github.io ; GitHub repository:\n https://github.com/ChenyangQiQi/FateZero"},{"id":"http://arxiv.org/abs/2303.05078v2","updated":"2023-10-11T17:46:03Z","published":"2023-03-09T07:26:49Z","title":"Efficient Transformer-based 3D Object Detection with Dynamic Token\n Halting","summary":" Balancing efficiency and accuracy is a long-standing problem for deploying\ndeep learning models. The trade-off is even more important for real-time\nsafety-critical systems like autonomous vehicles. In this paper, we propose an\neffective approach for accelerating transformer-based 3D object detectors by\ndynamically halting tokens at different layers depending on their contribution\nto the detection task. Although halting a token is a non-differentiable\noperation, our method allows for differentiable end-to-end learning by\nleveraging an equivalent differentiable forward-pass. Furthermore, our\nframework allows halted tokens to be reused to inform the model's predictions\nthrough a straightforward token recycling mechanism. Our method significantly\nimproves the Pareto frontier of efficiency versus accuracy when compared with\nthe existing approaches. By halting tokens and increasing model capacity, we\nare able to improve the baseline model's performance without increasing the\nmodel's latency on the Waymo Open Dataset.\n","authors":["Mao Ye","Gregory P. Meyer","Yuning Chai","Qiang Liu"],"pdf_url":"https://arxiv.org/pdf/2303.05078v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07687v1","updated":"2023-10-11T17:36:17Z","published":"2023-10-11T17:36:17Z","title":"Orbital Polarimetric Tomography of a Flare Near the Sagittarius A*\n Supermassive Black Hole","summary":" The interaction between the supermassive black hole at the center of the\nMilky Way, Sagittarius A$^*$, and its accretion disk, occasionally produces\nhigh energy flares seen in X-ray, infrared and radio. One mechanism for\nobserved flares is the formation of compact bright regions that appear within\nthe accretion disk and close to the event horizon. Understanding these flares\ncan provide a window into black hole accretion processes. Although\nsophisticated simulations predict the formation of these flares, their\nstructure has yet to be recovered by observations. Here we show the first\nthree-dimensional (3D) reconstruction of an emission flare in orbit recovered\nfrom ALMA light curves observed on April 11, 2017. Our recovery results show\ncompact bright regions at a distance of roughly 6 times the event horizon.\nMoreover, our recovery suggests a clockwise rotation in a low-inclination\norbital plane, a result consistent with prior studies by EHT and GRAVITY\ncollaborations. To recover this emission structure we solve a highly ill-posed\ntomography problem by integrating a neural 3D representation (an emergent\nartificial intelligence approach for 3D reconstruction) with a gravitational\nmodel for black holes. 
Although the recovered 3D structure is subject, and\nsometimes sensitive, to the model assumptions, under physically motivated\nchoices we find that our results are stable and our approach is successful on\nsimulated data. We anticipate that in the future, this approach could be used\nto analyze a richer collection of time-series data that could shed light on the\nmechanisms governing black hole and plasma dynamics.\n","authors":["Aviad Levis","Andrew A. Chael","Katherine L. Bouman","Maciek Wielgus","Pratul P. Srinivasan"],"pdf_url":"https://arxiv.org/pdf/2310.07687v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.12424v2","updated":"2023-10-11T17:34:19Z","published":"2023-06-21T17:59:51Z","title":"VisoGender: A dataset for benchmarking gender bias in image-text pronoun\n resolution","summary":" We introduce VisoGender, a novel dataset for benchmarking gender bias in\nvision-language models. We focus on occupation-related biases within a\nhegemonic system of binary gender, inspired by Winograd and Winogender schemas,\nwhere each image is associated with a caption containing a pronoun relationship\nof subjects and objects in the scene. VisoGender is balanced by gender\nrepresentation in professional roles, supporting bias evaluation in two ways:\ni) resolution bias, where we evaluate the difference between pronoun resolution\naccuracies for image subjects with gender presentations perceived as masculine\nversus feminine by human annotators and ii) retrieval bias, where we compare\nratios of professionals perceived to have masculine and feminine gender\npresentations retrieved for a gender-neutral search query. We benchmark several\nstate-of-the-art vision-language models and find that they demonstrate bias in\nresolving binary gender in complex scenes. While the direction and magnitude of\ngender bias depends on the task and the model being evaluated, captioning\nmodels are generally less biased than Vision-Language Encoders. Dataset and\ncode are available at https://github.com/oxai/visogender\n","authors":["Siobhan Mackenzie Hall","Fernanda Gonçalves Abrantes","Hanwen Zhu","Grace Sodunke","Aleksandar Shtedritski","Hannah Rose Kirk"],"pdf_url":"https://arxiv.org/pdf/2306.12424v2.pdf","comment":"Data and code available at https://github.com/oxai/visogender"},{"id":"http://arxiv.org/abs/2310.07682v1","updated":"2023-10-11T17:32:24Z","published":"2023-10-11T17:32:24Z","title":"Prediction of MET Overexpression in Non-Small Cell Lung Adenocarcinomas\n from Hematoxylin and Eosin Images","summary":" MET protein overexpression is a targetable event in non-small cell lung\ncancer (NSCLC) and is the subject of active drug development. Challenges in\nidentifying patients for these therapies include lack of access to validated\ntesting, such as standardized immunohistochemistry (IHC) assessment, and\nconsumption of valuable tissue for a single gene/protein assay. Development of\npre-screening algorithms using routinely available digitized hematoxylin and\neosin (H&E)-stained slides to predict MET overexpression could promote testing\nfor those who will benefit most. While assessment of MET expression using IHC\nis currently not routinely performed in NSCLC, next-generation sequencing is\ncommon and in some cases includes RNA expression panel testing. In this work,\nwe leveraged a large database of matched H&E slides and RNA expression data to\ntrain a weakly supervised model to predict MET RNA overexpression directly from\nH&E images. 
This model was evaluated on an independent holdout test set of 300\nover-expressed and 289 normal patients, demonstrating an ROC-AUC of 0.70 (95th\npercentile interval: 0.66 - 0.74) with stable performance characteristics\nacross different patient clinical variables and robust to synthetic noise on\nthe test set. These results suggest that H&E-based predictive models could be\nuseful to prioritize patients for confirmatory testing of MET protein or MET\ngene expression status.\n","authors":["Kshitij Ingale","Sun Hae Hong","Josh S. K. Bell","Abbas Rizvi","Amy Welch","Lingdao Sha","Irvin Ho","Kunal Nagpal","Aicha BenTaieb","Rohan P Joshi","Martin C Stumpe"],"pdf_url":"https://arxiv.org/pdf/2310.07682v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07678v1","updated":"2023-10-11T17:21:48Z","published":"2023-10-11T17:21:48Z","title":"Explainable Image Similarity: Integrating Siamese Networks and Grad-CAM","summary":" With the proliferation of image-based applications in various domains, the\nneed for accurate and interpretable image similarity measures has become\nincreasingly critical. Existing image similarity models often lack\ntransparency, making it challenging to understand the reasons why two images\nare considered similar. In this paper, we propose the concept of explainable\nimage similarity, where the goal is the development of an approach, which is\ncapable of providing similarity scores along with visual factual and\ncounterfactual explanations. Along this line, we present a new framework, which\nintegrates Siamese Networks and Grad-CAM for providing explainable image\nsimilarity and discuss the potential benefits and challenges of adopting this\napproach. In addition, we provide a comprehensive discussion about factual and\ncounterfactual explanations provided by the proposed framework for assisting\ndecision making. The proposed approach has the potential to enhance the\ninterpretability, trustworthiness and user acceptance of image-based systems in\nreal-world image similarity applications. The implementation code can be found\nin https://github.com/ioannislivieris/Grad_CAM_Siamese.git.\n","authors":["Ioannis E. Livieris","Emmanuel Pintelas","Niki Kiriakidou","Panagiotis Pintelas"],"pdf_url":"https://arxiv.org/pdf/2310.07678v1.pdf","comment":"The manuscript has been submitted for publication in \"Journal of\n Imaging\""},{"id":"http://arxiv.org/abs/2310.07669v1","updated":"2023-10-11T17:18:15Z","published":"2023-10-11T17:18:15Z","title":"HaarNet: Large-scale Linear-Morphological Hybrid Network for RGB-D\n Semantic Segmentation","summary":" Signals from different modalities each have their own combination algebra\nwhich affects their sampling processing. RGB is mostly linear; depth is a\ngeometric signal following the operations of mathematical morphology. If a\nnetwork obtaining RGB-D input has both kinds of operators available in its\nlayers, it should be able to give effective output with fewer parameters. In\nthis paper, morphological elements in conjunction with more familiar linear\nmodules are used to construct a mixed linear-morphological network called\nHaarNet. This is the first large-scale linear-morphological hybrid, evaluated\non a set of sizeable real-world datasets. In the network, morphological Haar\nsampling is applied to both feature channels in several layers, which splits\nextreme values and high-frequency information such that both can be processed\nto improve both modalities. 
Moreover, morphologically parameterised ReLU is\nused, and morphologically-sound up-sampling is applied to obtain a\nfull-resolution output. Experiments show that HaarNet is competitive with a\nstate-of-the-art CNN, implying that morphological networks are a promising\nresearch direction for geometry-based learning tasks.\n","authors":["Rick Groenendijk","Leo Dorst","Theo Gevers"],"pdf_url":"https://arxiv.org/pdf/2310.07669v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07664v1","updated":"2023-10-11T17:09:19Z","published":"2023-10-11T17:09:19Z","title":"Accelerating Vision Transformers Based on Heterogeneous Attention\n Patterns","summary":" Recently, Vision Transformers (ViTs) have attracted a lot of attention in the\nfield of computer vision. Generally, the powerful representative capacity of\nViTs mainly benefits from the self-attention mechanism, which has a high\ncomputation complexity. To accelerate ViTs, we propose an integrated\ncompression pipeline based on observed heterogeneous attention patterns across\nlayers. On one hand, different images share more similar attention patterns in\nearly layers than later layers, indicating that the dynamic query-by-key\nself-attention matrix may be replaced with a static self-attention matrix in\nearly layers. Then, we propose a dynamic-guided static self-attention (DGSSA)\nmethod where the matrix inherits self-attention information from the replaced\ndynamic self-attention to effectively improve the feature representation\nability of ViTs. On the other hand, the attention maps have more low-rank\npatterns, which reflect token redundancy, in later layers than early layers. In\na view of linear dimension reduction, we further propose a method of global\naggregation pyramid (GLAD) to reduce the number of tokens in later layers of\nViTs, such as Deit. Experimentally, the integrated compression pipeline of\nDGSSA and GLAD can accelerate up to 121% run-time throughput compared with\nDeiT, which surpasses all SOTA approaches.\n","authors":["Deli Yu","Teng Xi","Jianwei Li","Baopu Li","Gang Zhang","Haocheng Feng","Junyu Han","Jingtuo Liu","Errui Ding","Jingdong Wang"],"pdf_url":"https://arxiv.org/pdf/2310.07664v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07663v1","updated":"2023-10-11T17:03:21Z","published":"2023-10-11T17:03:21Z","title":"Deep Video Inpainting Guided by Audio-Visual Self-Supervision","summary":" Humans can easily imagine a scene from auditory information based on their\nprior knowledge of audio-visual events. In this paper, we mimic this innate\nhuman ability in deep learning models to improve the quality of video\ninpainting. To implement the prior knowledge, we first train the audio-visual\nnetwork, which learns the correspondence between auditory and visual\ninformation. Then, the audio-visual network is employed as a guider that\nconveys the prior knowledge of audio-visual correspondence to the video\ninpainting network. This prior knowledge is transferred through our proposed\ntwo novel losses: audio-visual attention loss and audio-visual pseudo-class\nconsistency loss. These two losses further improve the performance of the video\ninpainting by encouraging the inpainting result to have a high correspondence\nto its synchronized audio. 
Experimental results demonstrate that our proposed\nmethod can restore a wider domain of video scenes and is particularly effective\nwhen the sounding object in the scene is partially blinded.\n","authors":["Kyuyeon Kim","Junsik Jung","Woo Jae Kim","Sung-Eui Yoon"],"pdf_url":"https://arxiv.org/pdf/2310.07663v1.pdf","comment":"Accepted at ICASSP 2022"},{"id":"http://arxiv.org/abs/2305.13172v2","updated":"2023-10-11T16:51:50Z","published":"2023-05-22T16:00:00Z","title":"Editing Large Language Models: Problems, Methods, and Opportunities","summary":" Despite the ability to train capable LLMs, the methodology for maintaining\ntheir relevancy and rectifying errors remains elusive. To this end, the past\nfew years have witnessed a surge in techniques for editing LLMs, the objective\nof which is to efficiently alter the behavior of LLMs within a specific domain\nwithout negatively impacting performance across other inputs. This paper\nembarks on a deep exploration of the problems, methods, and opportunities\nrelated to model editing for LLMs. In particular, we provide an exhaustive\noverview of the task definition and challenges associated with model editing,\nalong with an in-depth empirical analysis of the most progressive methods\ncurrently at our disposal. We also build a new benchmark dataset to facilitate\na more robust evaluation and pinpoint enduring issues intrinsic to existing\ntechniques. Our objective is to provide valuable insights into the\neffectiveness and feasibility of each editing technique, thereby assisting the\ncommunity in making informed decisions on the selection of the most appropriate\nmethod for a specific task or context. Code and datasets are available at\nhttps://github.com/zjunlp/EasyEdit.\n","authors":["Yunzhi Yao","Peng Wang","Bozhong Tian","Siyuan Cheng","Zhoubo Li","Shumin Deng","Huajun Chen","Ningyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2305.13172v2.pdf","comment":"EMNLP 2023. Updated with new experiments"},{"id":"http://arxiv.org/abs/2310.07638v1","updated":"2023-10-11T16:33:30Z","published":"2023-10-11T16:33:30Z","title":"Context-Enhanced Detector For Building Detection From Remote Sensing\n Images","summary":" The field of building detection from remote sensing images has made\nsignificant progress, but faces challenges in achieving high-accuracy detection\ndue to the diversity in building appearances and the complexity of vast scenes.\nTo address these challenges, we propose a novel approach called\nContext-Enhanced Detector (CEDet). Our approach utilizes a three-stage cascade\nstructure to enhance the extraction of contextual information and improve\nbuilding detection accuracy. Specifically, we introduce two modules: the\nSemantic Guided Contextual Mining (SGCM) module, which aggregates multi-scale\ncontexts and incorporates an attention mechanism to capture long-range\ninteractions, and the Instance Context Mining Module (ICMM), which captures\ninstance-level relationship context by constructing a spatial relationship\ngraph and aggregating instance features. Additionally, we introduce a semantic\nsegmentation loss based on pseudo-masks to guide contextual information\nextraction. 
Our method achieves state-of-the-art performance on three building\ndetection benchmarks, including CNBuilding-9P, CNBuilding-23P, and SpaceNet.\n","authors":["Ziyue Huang","Mingming Zhang","Qingjie Liu","Wei Wang","Zhe Dong","Yunhong Wang"],"pdf_url":"https://arxiv.org/pdf/2310.07638v1.pdf","comment":"12 pages, 7 figures"},{"id":"http://arxiv.org/abs/2310.07633v1","updated":"2023-10-11T16:28:24Z","published":"2023-10-11T16:28:24Z","title":"Attention-Map Augmentation for Hypercomplex Breast Cancer Classification","summary":" Breast cancer is the most widespread neoplasm among women and early detection\nof this disease is critical. Deep learning techniques have become of great\ninterest to improve diagnostic performance. Nonetheless, discriminating between\nmalignant and benign masses from whole mammograms remains challenging due to\nthem being almost identical to an untrained eye and the region of interest\n(ROI) occupying a minuscule portion of the entire image. In this paper, we\npropose a framework, parameterized hypercomplex attention maps (PHAM), to\novercome these problems. Specifically, we deploy an augmentation step based on\ncomputing attention maps. Then, the attention maps are used to condition the\nclassification step by constructing a multi-dimensional input comprised of the\noriginal breast cancer image and the corresponding attention map. In this step,\na parameterized hypercomplex neural network (PHNN) is employed to perform\nbreast cancer classification. The framework offers two main advantages. First,\nattention maps provide critical information regarding the ROI and allow the\nneural model to concentrate on it. Second, the hypercomplex architecture has\nthe ability to model local relations between input dimensions thanks to\nhypercomplex algebra rules, thus properly exploiting the information provided\nby the attention map. We demonstrate the efficacy of the proposed framework on\nboth mammography images as well as histopathological ones, surpassing\nattention-based state-of-the-art networks and the real-valued counterpart of\nour method. The code of our work is available at\nhttps://github.com/elelo22/AttentionBCS.\n","authors":["Eleonora Lopez","Filippo Betello","Federico Carmignani","Eleonora Grassucci","Danilo Comminiello"],"pdf_url":"https://arxiv.org/pdf/2310.07633v1.pdf","comment":"Submitted to Pattern Recognition Letters"},{"id":"http://arxiv.org/abs/2310.07632v1","updated":"2023-10-11T16:25:45Z","published":"2023-10-11T16:25:45Z","title":"Prompt Backdoors in Visual Prompt Learning","summary":" Fine-tuning large pre-trained computer vision models is infeasible for\nresource-limited users. Visual prompt learning (VPL) has thus emerged to\nprovide an efficient and flexible alternative to model fine-tuning through\nVisual Prompt as a Service (VPPTaaS). Specifically, the VPPTaaS provider\noptimizes a visual prompt given downstream data, and downstream users can use\nthis prompt together with the large pre-trained model for prediction. However,\nthis new learning paradigm may also pose security risks when the VPPTaaS\nprovider instead provides a malicious visual prompt. In this paper, we take the\nfirst step to explore such risks through the lens of backdoor attacks.\nSpecifically, we propose BadVisualPrompt, a simple yet effective backdoor\nattack against VPL. For example, poisoning $5\\%$ CIFAR10 training data leads to\nabove $99\\%$ attack success rates with only negligible model accuracy drop by\n$1.5\\%$. 
In particular, we identify and then address a new technical challenge\nrelated to interactions between the backdoor trigger and visual prompt, which\ndoes not exist in conventional, model-level backdoors. Moreover, we provide\nin-depth analyses of seven backdoor defenses from model, prompt, and input\nlevels. Overall, all these defenses are either ineffective or impractical to\nmitigate our BadVisualPrompt, implying the critical vulnerability of VPL.\n","authors":["Hai Huang","Zhengyu Zhao","Michael Backes","Yun Shen","Yang Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.07632v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07623v1","updated":"2023-10-11T16:06:14Z","published":"2023-10-11T16:06:14Z","title":"Dual Quaternion Rotational and Translational Equivariance in 3D Rigid\n Motion Modelling","summary":" Objects' rigid motions in 3D space are described by rotations and\ntranslations of a highly-correlated set of points, each with associated $x,y,z$\ncoordinates that real-valued networks consider as separate entities, losing\ninformation. Previous works exploit quaternion algebra and their ability to\nmodel rotations in 3D space. However, these algebras do not properly encode\ntranslations, leading to sub-optimal performance in 3D learning tasks. To\novercome these limitations, we employ a dual quaternion representation of rigid\nmotions in the 3D space that jointly describes rotations and translations of\npoint sets, processing each of the points as a single entity. Our approach is\ntranslation and rotation equivariant, so it does not suffer from shifts in the\ndata and better learns object trajectories, as we validate in the experimental\nevaluations. Models endowed with this formulation outperform previous\napproaches in a human pose forecasting application, attesting to the\neffectiveness of the proposed dual quaternion formulation for rigid motions in\n3D space.\n","authors":["Guilherme Vieira","Eleonora Grassucci","Marcos Eduardo Valle","Danilo Comminiello"],"pdf_url":"https://arxiv.org/pdf/2310.07623v1.pdf","comment":"Accepted at IEEE MLSP 2023 (Honorable Mention Top 10% Outstanding\n Paper)"},{"id":"http://arxiv.org/abs/2310.07602v1","updated":"2023-10-11T15:41:52Z","published":"2023-10-11T15:41:52Z","title":"Dual Radar: A Multi-modal Dataset with Dual 4D Radar for Autononous\n Driving","summary":" Radar has stronger adaptability in adverse scenarios for autonomous driving\nenvironmental perception compared to widely adopted cameras and LiDARs.\nCompared with commonly used 3D radars, latest 4D radars have precise vertical\nresolution and higher point cloud density, making it a highly promising sensor\nfor autonomous driving in complex environmental perception. However, due to the\nmuch higher noise than LiDAR, manufacturers choose different filtering\nstrategies, resulting in an inverse ratio between noise level and point cloud\ndensity. 
There is still a lack of comparative analysis on which method is\nbeneficial for deep learning-based perception algorithms in autonomous driving.\nOne of the main reasons is that current datasets only adopt one type of 4D\nradar, making it difficult to compare different 4D radars in the same scene.\nTherefore, in this paper, we introduce a novel large-scale multi-modal dataset\nfeaturing, for the first time, two types of 4D radars captured simultaneously.\nThis dataset enables further research into effective 4D radar perception\nalgorithms.Our dataset consists of 151 consecutive series, most of which last\n20 seconds and contain 10,007 meticulously synchronized and annotated frames.\nMoreover, our dataset captures a variety of challenging driving scenarios,\nincluding many road conditions, weather conditions, nighttime and daytime with\ndifferent lighting intensities and periods. Our dataset annotates consecutive\nframes, which can be applied to 3D object detection and tracking, and also\nsupports the study of multi-modal tasks. We experimentally validate our\ndataset, providing valuable results for studying different types of 4D radars.\nThis dataset is released on https://github.com/adept-thu/Dual-Radar.\n","authors":["Xinyu Zhang","Li Wang","Jian Chen","Cheng Fang","Lei Yang","Ziying Song","Guangqi Yang","Yichen Wang","Xiaofei Zhang","Jun Li"],"pdf_url":"https://arxiv.org/pdf/2310.07602v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07591v1","updated":"2023-10-11T15:33:10Z","published":"2023-10-11T15:33:10Z","title":"PeP: a Point enhanced Painting method for unified point cloud tasks","summary":" Point encoder is of vital importance for point cloud recognition. As the very\nbeginning step of whole model pipeline, adding features from diverse sources\nand providing stronger feature encoding mechanism would provide better input\nfor downstream modules. In our work, we proposed a novel PeP module to tackle\nabove issue. PeP contains two main parts, a refined point painting method and a\nLM-based point encoder. Experiments results on the nuScenes and KITTI datasets\nvalidate the superior performance of our PeP. The advantages leads to strong\nperformance on both semantic segmentation and object detection, in both lidar\nand multi-modal settings. Notably, our PeP module is model agnostic and\nplug-and-play. Our code will be publicly available soon.\n","authors":["Zichao Dong","Hang Ji","Xufeng Huang","Weikun Zhang","Xin Zhan","Junbo Chen"],"pdf_url":"https://arxiv.org/pdf/2310.07591v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07585v1","updated":"2023-10-11T15:21:40Z","published":"2023-10-11T15:21:40Z","title":"A Discrepancy Aware Framework for Robust Anomaly Detection","summary":" Defect detection is a critical research area in artificial intelligence.\nRecently, synthetic data-based self-supervised learning has shown great\npotential on this task. Although many sophisticated synthesizing strategies\nexist, little research has been done to investigate the robustness of models\nwhen faced with different strategies. In this paper, we focus on this issue and\nfind that existing methods are highly sensitive to them. To alleviate this\nissue, we present a Discrepancy Aware Framework (DAF), which demonstrates\nrobust performance consistently with simple and cheap strategies across\ndifferent anomaly detection benchmarks. 
We hypothesize that the high\nsensitivity to synthetic data of existing self-supervised methods arises from\ntheir heavy reliance on the visual appearance of synthetic data during\ndecoding. In contrast, our method leverages an appearance-agnostic cue to guide\nthe decoder in identifying defects, thereby alleviating its reliance on\nsynthetic appearance. To this end, inspired by existing knowledge distillation\nmethods, we employ a teacher-student network, which is trained based on\nsynthesized outliers, to compute the discrepancy map as the cue. Extensive\nexperiments on two challenging datasets prove the robustness of our method.\nUnder the simple synthesis strategies, it outperforms existing methods by a\nlarge margin. Furthermore, it also achieves the state-of-the-art localization\nperformance. Code is available at: https://github.com/caiyuxuan1120/DAF.\n","authors":["Yuxuan Cai","Dingkang Liang","Dongliang Luo","Xinwei He","Xin Yang","Xiang Bai"],"pdf_url":"https://arxiv.org/pdf/2310.07585v1.pdf","comment":"Accepted by IEEE Transactions on Industrial Informatics. Code is\n available at: https://github.com/caiyuxuan1120/DAF"},{"id":"http://arxiv.org/abs/2310.07584v1","updated":"2023-10-11T15:20:44Z","published":"2023-10-11T15:20:44Z","title":"Centrality of the Fingerprint Core Location","summary":" Fingerprints have long been recognized as a unique and reliable means of\npersonal identification. Central to the analysis and enhancement of\nfingerprints is the concept of the fingerprint core. Although the location of\nthe core is used in many applications, to the best of our knowledge, this study\nis the first to investigate the empirical distribution of the core over a\nlarge, combined dataset of rolled, as well as plain fingerprint recordings. We\nidentify and investigate the extent of incomplete rolling during the rolled\nfingerprint acquisition and investigate the centrality of the core. After\ncorrecting for the incomplete rolling, we find that the core deviates from the\nfingerprint center by 5.7% $\\pm$ 5.2% to 7.6% $\\pm$ 6.9%, depending on the\nfinger. Additionally, we find that the assumption of normal distribution of the\ncore position of plain fingerprint recordings cannot be rejected, but for\nrolled ones it can. Therefore, we use a multi-step process to find the\ndistribution of the rolled fingerprint recordings. The process consists of an\nAnderson-Darling normality test, the Bayesian Information Criterion to reduce\nthe number of possible candidate distributions and finally a Generalized Monte\nCarlo goodness-of-fit procedure to find the best fitting distribution. We find\nthe non-central Fischer distribution best describes the cores' horizontal\npositions. Finally, we investigate the correlation between mean core position\noffset and the NFIQ 2 score and find that the NFIQ 2 prefers rolled fingerprint\nrecordings where the core sits slightly below the fingerprint center.\n","authors":["Laurenz Ruzicka","Bernhard Strobl","Bernhard Kohn","Clemens Heitzinger"],"pdf_url":"https://arxiv.org/pdf/2310.07584v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07573v1","updated":"2023-10-11T15:15:05Z","published":"2023-10-11T15:15:05Z","title":"Relational Prior Knowledge Graphs for Detection and Instance\n Segmentation","summary":" Humans have a remarkable ability to perceive and reason about the world\naround them by understanding the relationships between objects. 
In this paper,\nwe investigate the effectiveness of using such relationships for object\ndetection and instance segmentation. To this end, we propose a Relational\nPrior-based Feature Enhancement Model (RP-FEM), a graph transformer that\nenhances object proposal features using relational priors. The proposed\narchitecture operates on top of scene graphs obtained from initial proposals\nand aims to concurrently learn relational context modeling for object detection\nand instance segmentation. Experimental evaluations on COCO show that the\nutilization of scene graphs, augmented with relational priors, offer benefits\nfor object detection and instance segmentation. RP-FEM demonstrates its\ncapacity to suppress improbable class predictions within the image while also\npreventing the model from generating duplicate predictions, leading to\nimprovements over the baseline model on which it is built.\n","authors":["Osman Ülger","Yu Wang","Ysbrand Galama","Sezer Karaoglu","Theo Gevers","Martin R. Oswald"],"pdf_url":"https://arxiv.org/pdf/2310.07573v1.pdf","comment":"Published in ICCV2023 SG2RL Workshop"},{"id":"http://arxiv.org/abs/2310.07572v1","updated":"2023-10-11T15:14:54Z","published":"2023-10-11T15:14:54Z","title":"Impact of Label Types on Training SWIN Models with Overhead Imagery","summary":" Understanding the impact of data set design on model training and performance\ncan help alleviate the costs associated with generating remote sensing and\noverhead labeled data. This work examined the impact of training shifted window\ntransformers using bounding boxes and segmentation labels, where the latter are\nmore expensive to produce. We examined classification tasks by comparing models\ntrained with both target and backgrounds against models trained with only\ntarget pixels, extracted by segmentation labels. For object detection models,\nwe compared performance using either label type when training. We found that\nthe models trained on only target pixels do not show performance improvement\nfor classification tasks, appearing to conflate background pixels in the\nevaluation set with target pixels. For object detection, we found that models\ntrained with either label type showed equivalent performance across testing. We\nfound that bounding boxes appeared to be sufficient for tasks that did not\nrequire more complex labels, such as object segmentation. Continuing work to\ndetermine consistency of this result across data types and model architectures\ncould potentially result in substantial savings in generating remote sensing\ndata sets for deep learning.\n","authors":["Ryan Ford","Kenneth Hutchison","Nicholas Felts","Benjamin Cheng","Jesse Lew","Kyle Jackson"],"pdf_url":"https://arxiv.org/pdf/2310.07572v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.15751v3","updated":"2023-10-11T15:13:02Z","published":"2022-11-28T20:11:37Z","title":"Edge Video Analytics: A Survey on Applications, Systems and Enabling\n Techniques","summary":" Video, as a key driver in the global explosion of digital information, can\ncreate tremendous benefits for human society. Governments and enterprises are\ndeploying innumerable cameras for a variety of applications, e.g., law\nenforcement, emergency management, traffic control, and security surveillance,\nall facilitated by video analytics (VA). This trend is spurred by the rapid\nadvancement of deep learning (DL), which enables more precise models for object\nclassification, detection, and tracking. 
Meanwhile, with the proliferation of\nInternet-connected devices, massive amounts of data are generated daily,\noverwhelming the cloud. Edge computing, an emerging paradigm that moves\nworkloads and services from the network core to the network edge, has been\nwidely recognized as a promising solution. The resulting new intersection, edge\nvideo analytics (EVA), begins to attract widespread attention. Nevertheless,\nonly a few loosely-related surveys exist on this topic. The basic concepts of\nEVA (e.g., definition, architectures) were not fully elucidated due to the\nrapid development of this domain. To fill these gaps, we provide a\ncomprehensive survey of the recent efforts on EVA. In this paper, we first\nreview the fundamentals of edge computing, followed by an overview of VA. EVA\nsystems and their enabling techniques are discussed next. In addition, we\nintroduce prevalent frameworks and datasets to aid future researchers in the\ndevelopment of EVA systems. Finally, we discuss existing challenges and foresee\nfuture research directions. We believe this survey will help readers comprehend\nthe relationship between VA and edge computing, and spark new ideas on EVA.\n","authors":["Renjie Xu","Saiedeh Razavi","Rong Zheng"],"pdf_url":"https://arxiv.org/pdf/2211.15751v3.pdf","comment":"Accepted in IEEE Communications Surveys and Tutorials, 2023"},{"id":"http://arxiv.org/abs/2304.14933v2","updated":"2023-10-11T15:08:51Z","published":"2023-04-28T15:43:21Z","title":"An Empirical Study of Multimodal Model Merging","summary":" Model merging (e.g., via interpolation or task arithmetic) fuses multiple\nmodels trained on different tasks to generate a multi-task solution. The\ntechnique has been proven successful in previous studies, where the models are\ntrained on similar tasks and with the same initialization. In this paper, we\nexpand on this concept to a multimodal setup by merging transformers trained on\ndifferent modalities. Furthermore, we conduct our study for a novel goal where\nwe can merge vision, language, and cross-modal transformers of a\nmodality-specific architecture to create a parameter-efficient\nmodality-agnostic architecture. Through comprehensive experiments, we\nsystematically investigate the key factors impacting model performance after\nmerging, including initialization, merging mechanisms, and model architectures.\nWe also propose two metrics that assess the distance between weights to be\nmerged and can serve as an indicator of the merging outcomes. Our analysis\nleads to an effective training recipe for matching the performance of the\nmodality-agnostic baseline (i.e., pre-trained from scratch) via model merging.\nOur method also outperforms naive merging significantly on various tasks, with\nimprovements of 3% on VQA, 7% on COCO retrieval, 25% on NLVR2, 14% on Flickr30k\nand 3% on ADE20k. Our code is available at https://github.com/ylsung/vl-merging\n","authors":["Yi-Lin Sung","Linjie Li","Kevin Lin","Zhe Gan","Mohit Bansal","Lijuan Wang"],"pdf_url":"https://arxiv.org/pdf/2304.14933v2.pdf","comment":"EMNLP 2023 Findings"},{"id":"http://arxiv.org/abs/2310.07555v1","updated":"2023-10-11T15:00:11Z","published":"2023-10-11T15:00:11Z","title":"Does resistance to Style-Transfer equal Shape Bias? Evaluating Shape\n Bias by Distorted Shape","summary":" Deep learning models are known to exhibit a strong texture bias, while human\ntends to rely heavily on global shape for object recognition. 
The current\nbenchmark for evaluating a model's shape bias is a set of style-transferred\nimages with the assumption that resistance to the attack of style transfer is\nrelated to the development of shape sensitivity in the model. In this work, we\nshow that networks trained with style-transfer images indeed learn to ignore\nstyle, but its shape bias arises primarily from local shapes. We provide a\nDistorted Shape Testbench (DiST) as an alternative measurement of global shape\nsensitivity. Our test includes 2400 original images from ImageNet-1K, each of\nwhich is accompanied by two images with the global shapes of the original image\ndistorted while preserving its texture via the texture synthesis program. We\nfound that (1) models that performed well on the previous shape bias evaluation\ndo not fare well in the proposed DiST; (2) the widely adopted ViT models do not\nshow significant advantages over Convolutional Neural Networks (CNNs) on this\nbenchmark despite that ViTs rank higher on the previous shape bias tests. (3)\ntraining with DiST images bridges the significant gap between human and\nexisting SOTA models' performance while preserving the models' accuracy on\nstandard image classification tasks; training with DiST images and\nstyle-transferred images are complementary, and can be combined to train\nnetwork together to enhance both the global and local shape sensitivity of the\nnetwork. Our code will be host at: https://github.com/leelabcnbc/DiST\n","authors":["Ziqi Wen","Tianqin Li","Tai Sing Lee"],"pdf_url":"https://arxiv.org/pdf/2310.07555v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07552v1","updated":"2023-10-11T14:54:40Z","published":"2023-10-11T14:54:40Z","title":"ProtoHPE: Prototype-guided High-frequency Patch Enhancement for\n Visible-Infrared Person Re-identification","summary":" Visible-infrared person re-identification is challenging due to the large\nmodality gap. To bridge the gap, most studies heavily rely on the correlation\nof visible-infrared holistic person images, which may perform poorly under\nsevere distribution shifts. In contrast, we find that some cross-modal\ncorrelated high-frequency components contain discriminative visual patterns and\nare less affected by variations such as wavelength, pose, and background\nclutter than holistic images. Therefore, we are motivated to bridge the\nmodality gap based on such high-frequency components, and propose\n\\textbf{Proto}type-guided \\textbf{H}igh-frequency \\textbf{P}atch\n\\textbf{E}nhancement (ProtoHPE) with two core designs. \\textbf{First}, to\nenhance the representation ability of cross-modal correlated high-frequency\ncomponents, we split patches with such components by Wavelet Transform and\nexponential moving average Vision Transformer (ViT), then empower ViT to take\nthe split patches as auxiliary input. \\textbf{Second}, to obtain semantically\ncompact and discriminative high-frequency representations of the same identity,\nwe propose Multimodal Prototypical Contrast. To be specific, it hierarchically\ncaptures the comprehensive semantics of different modal instances, facilitating\nthe aggregation of high-frequency representations belonging to the same\nidentity. 
With it, ViT can capture key high-frequency components during\ninference without relying on ProtoHPE, thus bringing no extra complexity.\nExtensive experiments validate the effectiveness of ProtoHPE.\n","authors":["Guiwei Zhang","Yongfei Zhang","Zichang Tan"],"pdf_url":"https://arxiv.org/pdf/2310.07552v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07548v1","updated":"2023-10-11T14:50:52Z","published":"2023-10-11T14:50:52Z","title":"Attribute Localization and Revision Network for Zero-Shot Learning","summary":" Zero-shot learning enables the model to recognize unseen categories with the\naid of auxiliary semantic information such as attributes. Current works\npropose to detect attributes from local image regions and align extracted\nfeatures with class-level semantics. In this paper, we find that the choice\nbetween local and global features is not a zero-sum game; global features can\nalso contribute to the understanding of attributes. In addition, aligning\nattribute features with class-level semantics ignores potential intra-class\nattribute variation. To mitigate these disadvantages, we present the Attribute\nLocalization and Revision Network in this paper. First, we design an Attribute\nLocalization Module (ALM) to capture both local and global features from image\nregions; a novel module called the Scale Control Unit is incorporated to fuse\nglobal and local representations. Second, we propose an Attribute Revision Module\n(ARM), which generates image-level semantics by revising the ground-truth value\nof each attribute, compensating for performance degradation caused by ignoring\nintra-class variation. Finally, the output of ALM is aligned with the revised\nsemantics produced by ARM to complete the training process. Comprehensive\nexperimental results on three widely used benchmarks demonstrate the\neffectiveness of our model in the zero-shot prediction task.\n","authors":["Junzhe Xu","Suling Duan","Chenwei Tang","Zhenan He","Jiancheng Lv"],"pdf_url":"https://arxiv.org/pdf/2310.07548v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12067v2","updated":"2023-10-11T14:49:26Z","published":"2023-08-23T11:27:30Z","title":"InstructionGPT-4: A 200-Instruction Paradigm for Fine-Tuning MiniGPT-4","summary":" Multimodal large language models are typically trained in two stages: first\npre-training on image-text pairs, and then fine-tuning using supervised\nvision-language instruction data. Recent studies have shown that large language\nmodels can achieve satisfactory results even with a limited amount of\nhigh-quality instruction-following data. In this paper, we introduce\nInstructionGPT-4, which is fine-tuned on a small dataset comprising only 200\nexamples, amounting to approximately 6\% of the instruction-following data used\nin the alignment dataset for MiniGPT-4. To achieve this, we first propose\nseveral metrics to assess the quality of multimodal instruction data. Based on\nthese metrics, we present an effective and trainable data selector to\nautomatically identify and filter low-quality vision-language data. By\nemploying this method, InstructionGPT-4 outperforms the original MiniGPT-4 on\nvarious evaluations. Overall, our findings demonstrate that a smaller amount of\nhigh-quality instruction-tuning data is sufficient to enable multimodal large\nlanguage models to generate better output. 
Our code is available at\nhttps://github.com/waltonfuture/InstructionGPT-4.\n","authors":["Lai Wei","Zihao Jiang","Weiran Huang","Lichao Sun"],"pdf_url":"https://arxiv.org/pdf/2308.12067v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07534v1","updated":"2023-10-11T14:39:12Z","published":"2023-10-11T14:39:12Z","title":"Human-Centered Evaluation of XAI Methods","summary":" In the ever-evolving field of Artificial Intelligence, a critical challenge\nhas been to decipher the decision-making processes within the so-called \"black\nboxes\" in deep learning. Over recent years, a plethora of methods have emerged,\ndedicated to explaining decisions across diverse tasks. Particularly in tasks\nlike image classification, these methods typically identify and emphasize the\npivotal pixels that most influence a classifier's prediction. Interestingly,\nthis approach mirrors human behavior: when asked to explain our rationale for\nclassifying an image, we often point to the most salient features or aspects.\nCapitalizing on this parallel, our research embarked on a user-centric study.\nWe sought to objectively measure the interpretability of three leading\nexplanation methods: (1) Prototypical Part Network, (2) Occlusion, and (3)\nLayer-wise Relevance Propagation. Intriguingly, our results highlight that\nwhile the regions spotlighted by these methods can vary widely, they all offer\nhumans a nearly equivalent depth of understanding. This enables users to\ndiscern and categorize images efficiently, reinforcing the value of these\nmethods in enhancing AI transparency.\n","authors":["Karam Dawoud","Wojciech Samek","Sebastian Lapuschkin","Sebastian Bosse"],"pdf_url":"https://arxiv.org/pdf/2310.07534v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12813v2","updated":"2023-10-11T14:35:26Z","published":"2023-07-24T14:06:54Z","title":"Described Object Detection: Liberating Object Detection with Flexible\n Expressions","summary":" Detecting objects based on language information is a popular task that\nincludes Open-Vocabulary object Detection (OVD) and Referring Expression\nComprehension (REC). In this paper, we advance them to a more practical setting\ncalled Described Object Detection (DOD) by expanding category names to flexible\nlanguage expressions for OVD and overcoming the limitation of REC only\ngrounding the pre-existing object. We establish the research foundation for DOD\nby constructing a Description Detection Dataset ($D^3$). This dataset features\nflexible language expressions, whether short category names or long\ndescriptions, and annotating all described objects on all images without\nomission. By evaluating previous SOTA methods on $D^3$, we find some\ntroublemakers that fail current REC, OVD, and bi-functional methods. REC\nmethods struggle with confidence scores, rejecting negative instances, and\nmulti-target scenarios, while OVD methods face constraints with long and\ncomplex descriptions. Recent bi-functional methods also do not work well on DOD\ndue to their separated training procedures and inference strategies for REC and\nOVD tasks. 
Building upon the aforementioned findings, we propose a baseline\nthat largely improves REC methods by reconstructing the training data and\nintroducing a binary classification sub-task, outperforming existing methods.\nData and code are available at https://github.com/shikras/d-cube and related\nworks are tracked in\nhttps://github.com/Charles-Xie/awesome-described-object-detection.\n","authors":["Chi Xie","Zhao Zhang","Yixuan Wu","Feng Zhu","Rui Zhao","Shuang Liang"],"pdf_url":"https://arxiv.org/pdf/2307.12813v2.pdf","comment":"Accepted by NeurIPS 2023"},{"id":"http://arxiv.org/abs/2304.01950v2","updated":"2023-10-11T14:21:29Z","published":"2023-04-01T09:16:40Z","title":"MP-FedCL: Multiprototype Federated Contrastive Learning for Edge\n Intelligence","summary":" Federated learning-assisted edge intelligence enables privacy protection in\nmodern intelligent services. However, not independent and identically\ndistributed (non-IID) distribution among edge clients can impair the local\nmodel performance. The existing single prototype-based strategy represents a\nclass by using the mean of the feature space. However, feature spaces are\nusually not clustered, and a single prototype may not represent a class well.\nMotivated by this, this paper proposes a multi-prototype federated contrastive\nlearning approach (MP-FedCL) which demonstrates the effectiveness of using a\nmulti-prototype strategy over a single-prototype under non-IID settings,\nincluding both label and feature skewness. Specifically, a multi-prototype\ncomputation strategy based on \\textit{k-means} is first proposed to capture\ndifferent embedding representations for each class space, using multiple\nprototypes ($k$ centroids) to represent a class in the embedding space. In each\nglobal round, the computed multiple prototypes and their respective model\nparameters are sent to the edge server for aggregation into a global prototype\npool, which is then sent back to all clients to guide their local training.\nFinally, local training for each client minimizes their own supervised learning\ntasks and learns from shared prototypes in the global prototype pool through\nsupervised contrastive learning, which encourages them to learn knowledge\nrelated to their own class from others and reduces the absorption of unrelated\nknowledge in each global iteration. Experimental results on MNIST, Digit-5,\nOffice-10, and DomainNet show that our method outperforms multiple baselines,\nwith an average test accuracy improvement of about 4.6\\% and 10.4\\% under\nfeature and label non-IID distributions, respectively.\n","authors":["Yu Qiao","Md. Shirajum Munir","Apurba Adhikary","Huy Q. Le","Avi Deb Raha","Chaoning Zhang","Choong Seon Hong"],"pdf_url":"https://arxiv.org/pdf/2304.01950v2.pdf","comment":"Accepted by IEEE Internet of Things"},{"id":"http://arxiv.org/abs/2310.07522v1","updated":"2023-10-11T14:19:05Z","published":"2023-10-11T14:19:05Z","title":"S4C: Self-Supervised Semantic Scene Completion with Neural Fields","summary":" 3D semantic scene understanding is a fundamental challenge in computer\nvision. It enables mobile agents to autonomously plan and navigate arbitrary\nenvironments. SSC formalizes this challenge as jointly estimating dense\ngeometry and semantic information from sparse observations of a scene. Current\nmethods for SSC are generally trained on 3D ground truth based on aggregated\nLiDAR scans. This process relies on special sensors and annotation by hand\nwhich are costly and do not scale well. 
To overcome this issue, our work\npresents the first self-supervised approach to SSC called S4C that does not\nrely on 3D ground truth data. Our proposed method can reconstruct a scene from\na single image and only relies on videos and pseudo segmentation ground truth\ngenerated from off-the-shelf image segmentation network during training. Unlike\nexisting methods, which use discrete voxel grids, we represent scenes as\nimplicit semantic fields. This formulation allows querying any point within the\ncamera frustum for occupancy and semantic class. Our architecture is trained\nthrough rendering-based self-supervised losses. Nonetheless, our method\nachieves performance close to fully supervised state-of-the-art methods.\nAdditionally, our method demonstrates strong generalization capabilities and\ncan synthesize accurate segmentation maps for far away viewpoints.\n","authors":["Adrian Hayler","Felix Wimbauer","Dominik Muhle","Christian Rupprecht","Daniel Cremers"],"pdf_url":"https://arxiv.org/pdf/2310.07522v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07517v1","updated":"2023-10-11T14:15:25Z","published":"2023-10-11T14:15:25Z","title":"CM-PIE: Cross-modal perception for interactive-enhanced audio-visual\n video parsing","summary":" Audio-visual video parsing is the task of categorizing a video at the segment\nlevel with weak labels, and predicting them as audible or visible events.\nRecent methods for this task leverage the attention mechanism to capture the\nsemantic correlations among the whole video across the audio-visual modalities.\nHowever, these approaches have overlooked the importance of individual segments\nwithin a video and the relationship among them, and tend to rely on a single\nmodality when learning features. In this paper, we propose a novel\ninteractive-enhanced cross-modal perception method~(CM-PIE), which can learn\nfine-grained features by applying a segment-based attention module.\nFurthermore, a cross-modal aggregation block is introduced to jointly optimize\nthe semantic representation of audio and visual signals by enhancing\ninter-modal interactions. The experimental results show that our model offers\nimproved parsing performance on the Look, Listen, and Parse dataset compared to\nother methods.\n","authors":["Yaru Chen","Ruohao Guo","Xubo Liu","Peipei Wu","Guangyao Li","Zhenbo Li","Wenwu Wang"],"pdf_url":"https://arxiv.org/pdf/2310.07517v1.pdf","comment":"5 pages, 3 figures, 15 references"},{"id":"http://arxiv.org/abs/2310.07511v1","updated":"2023-10-11T14:07:05Z","published":"2023-10-11T14:07:05Z","title":"A Unified Remote Sensing Anomaly Detector Across Modalities and Scenes\n via Deviation Relationship Learning","summary":" Remote sensing anomaly detector can find the objects deviating from the\nbackground as potential targets. Given the diversity in earth anomaly types, a\nunified anomaly detector across modalities and scenes should be cost-effective\nand flexible to new earth observation sources and anomaly types. However, the\ncurrent anomaly detectors are limited to a single modality and single scene,\nsince they aim to learn the varying background distribution. Motivated by the\nuniversal anomaly deviation pattern, in that anomalies exhibit deviations from\ntheir local context, we exploit this characteristic to build a unified anomaly\ndetector. 
Firstly, we reformulate the anomaly detection task as an undirected\nbilayer graph based on the deviation relationship, where the anomaly score is\nmodeled as the conditional probability, given the pattern of the background and\nnormal objects. The learning objective is then expressed as a conditional\nprobability ranking problem. Furthermore, we design an instantiation of the\nreformulation in the data, architecture, and optimization aspects. Simulated\nspectral and spatial anomalies drive the instantiated architecture. The model\nis optimized directly for the conditional probability ranking. The proposed\nmodel was validated in five modalities including the hyperspectral, visible\nlight, synthetic aperture radar (SAR), infrared and low light to show its\nunified detection ability.\n","authors":["Jingtao Li","Xinyu Wang","Hengwei Zhao","Liangpei Zhang","Yanfei Zhong"],"pdf_url":"https://arxiv.org/pdf/2310.07511v1.pdf","comment":"Journal paper"},{"id":"http://arxiv.org/abs/2310.07510v1","updated":"2023-10-11T14:06:04Z","published":"2023-10-11T14:06:04Z","title":"Heuristic Vision Pre-Training with Self-Supervised and Supervised\n Multi-Task Learning","summary":" To mimic human vision with the way of recognizing the diverse and open world,\nfoundation vision models are much critical. While recent techniques of\nself-supervised learning show the promising potentiality of this mission, we\nargue that signals from labelled data are also important for common-sense\nrecognition, and properly chosen pre-text tasks can facilitate the efficiency\nof vision representation learning. To this end, we propose a novel pre-training\nframework by adopting both self-supervised and supervised visual pre-text tasks\nin a multi-task manner. Specifically, given an image, we take a heuristic way\nby considering its intrinsic style properties, inside objects with their\nlocations and correlations, and how it looks like in 3D space for basic visual\nunderstanding. However, large-scale object bounding boxes and correlations are\nusually hard to achieve. Alternatively, we develop a hybrid method by\nleveraging both multi-label classification and self-supervised learning. On the\none hand, under the multi-label supervision, the pre-trained model can explore\nthe detailed information of an image, e.g., image types, objects, and part of\nsemantic relations. On the other hand, self-supervised learning tasks, with\nrespect to Masked Image Modeling (MIM) and contrastive learning, can help the\nmodel learn pixel details and patch correlations. Results show that our\npre-trained models can deliver results on par with or better than\nstate-of-the-art (SOTA) results on multiple visual tasks. For example, with a\nvanilla Swin-B backbone, we achieve 85.3\\% top-1 accuracy on ImageNet-1K\nclassification, 47.9 box AP on COCO object detection for Mask R-CNN, and 50.6\nmIoU on ADE-20K semantic segmentation when using Upernet. The performance shows\nthe ability of our vision foundation model to serve general purpose vision\ntasks.\n","authors":["Zhiming Qian"],"pdf_url":"https://arxiv.org/pdf/2310.07510v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07506v1","updated":"2023-10-11T14:02:11Z","published":"2023-10-11T14:02:11Z","title":"Leveraging Hierarchical Feature Sharing for Efficient Dataset\n Condensation","summary":" Given a real-world dataset, data condensation (DC) aims to synthesize a\nsignificantly smaller dataset that captures the knowledge of this dataset for\nmodel training with high performance. 
Recent works propose to enhance DC with\ndata parameterization, which condenses data into parameterized data containers\nrather than pixel space. The intuition behind data parameterization is to\nencode shared features of images to avoid additional storage costs. In this\npaper, we recognize that images share common features in a hierarchical way due\nto the inherent hierarchical structure of the classification system, which is\noverlooked by current data parameterization methods. To better align DC with\nthis hierarchical nature and encourage more efficient information sharing\ninside data containers, we propose a novel data parameterization architecture,\nHierarchical Memory Network (HMN). HMN stores condensed data in a three-tier\nstructure, representing the dataset-level, class-level, and instance-level\nfeatures. Another helpful property of the hierarchical architecture is that HMN\nnaturally ensures good independence among images despite achieving information\nsharing. This enables instance-level pruning for HMN to reduce redundant\ninformation, thereby further minimizing redundancy and enhancing performance.\nWe evaluate HMN on four public datasets (SVHN, CIFAR10, CIFAR100, and\nTiny-ImageNet) and compare HMN with eight DC baselines. The evaluation results\nshow that our proposed method outperforms all baselines, even when trained with\na batch-based loss consuming less GPU memory.\n","authors":["Haizhong Zheng","Jiachen Sun","Shutong Wu","Bhavya Kailkhura","Zhuoqing Mao","Chaowei Xiao","Atul Prakash"],"pdf_url":"https://arxiv.org/pdf/2310.07506v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07504v1","updated":"2023-10-11T14:01:36Z","published":"2023-10-11T14:01:36Z","title":"PtychoDV: Vision Transformer-Based Deep Unrolling Network for\n Ptychographic Image Reconstruction","summary":" Ptychography is an imaging technique that captures multiple overlapping\nsnapshots of a sample, illuminated coherently by a moving localized probe. The\nimage recovery from ptychographic data is generally achieved via an iterative\nalgorithm that solves a nonlinear phase-field problem derived from measured\ndiffraction patterns. However, these approaches have high computational cost.\nIn this paper, we introduce PtychoDV, a novel deep model-based network designed\nfor efficient, high-quality ptychographic image reconstruction. PtychoDV\ncomprises a vision transformer that generates an initial image from the set of\nraw measurements, taking into consideration their mutual correlations. This is\nfollowed by a deep unrolling network that refines the initial image using\nlearnable convolutional priors and the ptychography measurement model.\nExperimental results on simulated data demonstrate that PtychoDV is capable of\noutperforming existing deep learning methods for this problem, and\nsignificantly reduces computational cost compared to iterative methodologies,\nwhile maintaining competitive performance.\n","authors":["Weijie Gan","Qiuchen Zhai","Michael Thompson McCann","Cristina Garcia Cardona","Ulugbek S. 
Kamilov","Brendt Wohlberg"],"pdf_url":"https://arxiv.org/pdf/2310.07504v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08313v3","updated":"2023-10-11T13:55:29Z","published":"2023-08-16T12:18:27Z","title":"ECPC-IDS: A benchmark endometrial cancer PET/CT image dataset for\n evaluation of semantic segmentation and detection of hypermetabolic regions","summary":" Endometrial cancer is one of the most common tumors in the female\nreproductive system and is the third most common gynecological malignancy that\ncauses death after ovarian and cervical cancer. Early diagnosis can\nsignificantly improve the 5-year survival rate of patients. With the\ndevelopment of artificial intelligence, computer-assisted diagnosis plays an\nincreasingly important role in improving the accuracy and objectivity of\ndiagnosis, as well as reducing the workload of doctors. However, the absence of\npublicly available endometrial cancer image datasets restricts the application\nof computer-assisted diagnostic techniques. In this paper, a publicly available\nEndometrial Cancer PET/CT Image Dataset for Evaluation of Semantic Segmentation\nand Detection of Hypermetabolic Regions (ECPC-IDS) is published. Specifically,\nthe segmentation section includes PET and CT images, with a total of 7159\nimages in multiple formats. In order to prove the effectiveness of segmentation\nmethods on ECPC-IDS, five classical deep learning semantic segmentation methods\nare selected to test the image segmentation task. The object detection section\nalso includes PET and CT images, with a total of 3579 images and XML files with\nannotation information. Six deep learning methods are selected for experiments\non the detection task. This study conducts extensive experiments using deep\nlearning-based semantic segmentation and object detection methods to\ndemonstrate the differences between various methods on ECPC-IDS. As far as we\nknow, this is the first publicly available dataset of endometrial cancer with a\nlarge number of multiple images, including a large amount of information\nrequired for image and target detection. ECPC-IDS can aid researchers in\nexploring new algorithms to enhance computer-assisted technology, benefiting\nboth clinical doctors and patients greatly.\n","authors":["Dechao Tang","Tianming Du","Deguo Ma","Zhiyu Ma","Hongzan Sun","Marcin Grzegorzek","Huiyan Jiang","Chen Li"],"pdf_url":"https://arxiv.org/pdf/2308.08313v3.pdf","comment":"14 pages, 6 figures"},{"id":"http://arxiv.org/abs/2310.07492v1","updated":"2023-10-11T13:39:11Z","published":"2023-10-11T13:39:11Z","title":"Boosting Black-box Attack to Deep Neural Networks with Conditional\n Diffusion Models","summary":" Existing black-box attacks have demonstrated promising potential in creating\nadversarial examples (AE) to deceive deep learning models. Most of these\nattacks need to handle a vast optimization space and require a large number of\nqueries, hence exhibiting limited practical impacts in real-world scenarios. In\nthis paper, we propose a novel black-box attack strategy, Conditional Diffusion\nModel Attack (CDMA), to improve the query efficiency of generating AEs under\nquery-limited situations. 
The key insight of CDMA is to formulate the task of\nAE synthesis as a distribution transformation problem, i.e., benign examples\nand their corresponding AEs can be regarded as coming from two distinctive\ndistributions and can transform from each other with a particular converter.\nUnlike the conventional \\textit{query-and-optimization} approach, we generate\neligible AEs with direct conditional transform using the aforementioned data\nconverter, which can significantly reduce the number of queries needed. CDMA\nadopts the conditional Denoising Diffusion Probabilistic Model as the\nconverter, which can learn the transformation from clean samples to AEs, and\nensure the smooth development of perturbed noise resistant to various defense\nstrategies. We demonstrate the effectiveness and efficiency of CDMA by\ncomparing it with nine state-of-the-art black-box attacks across three\nbenchmark datasets. On average, CDMA can reduce the query count to a handful of\ntimes; in most cases, the query count is only ONE. We also show that CDMA can\nobtain $>99\\%$ attack success rate for untarget attacks over all datasets and\ntargeted attack over CIFAR-10 with the noise budget of $\\epsilon=16$.\n","authors":["Renyang Liu","Wei Zhou","Tianwei Zhang","Kangjie Chen","Jun Zhao","Kwok-Yan Lam"],"pdf_url":"https://arxiv.org/pdf/2310.07492v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.11161v2","updated":"2023-10-11T13:29:23Z","published":"2023-04-02T16:03:44Z","title":"altiro3D: Scene representation from single image and novel view\n synthesis","summary":" We introduce altiro3D, a free extended library developed to represent reality\nstarting from a given original RGB image or flat video. It allows to generate a\nlight-field (or Native) image or video and get a realistic 3D experience. To\nsynthesize N-number of virtual images and add them sequentially into a Quilt\ncollage, we apply MiDaS models for the monocular depth estimation, simple\nOpenCV and Telea inpainting techniques to map all pixels, and implement a\n'Fast' algorithm to handle 3D projection camera and scene transformations along\nN-viewpoints. We use the degree of depth to move proportionally the pixels,\nassuming the original image to be at the center of all the viewpoints. altiro3D\ncan also be used with DIBR algorithm to compute intermediate snapshots from a\nequivalent 'Real (slower)' camera with N-geometric viewpoints, which requires\nto calibrate a priori several intrinsic and extrinsic camera parameters. We\nadopt a pixel- and device-based Lookup Table to optimize computing time. The\nmultiple viewpoints and video generated from a single image or frame can be\ndisplayed in a free-view LCD display.\n","authors":["E. Canessa","L. Tenze"],"pdf_url":"https://arxiv.org/pdf/2304.11161v2.pdf","comment":"In press (2023) Springer International Journal of Information\n Technology (IJIT) 10 pages, 3 figures"},{"id":"http://arxiv.org/abs/2310.07473v1","updated":"2023-10-11T13:19:29Z","published":"2023-10-11T13:19:29Z","title":"FGPrompt: Fine-grained Goal Prompting for Image-goal Navigation","summary":" Learning to navigate to an image-specified goal is an important but\nchallenging task for autonomous systems. The agent is required to reason the\ngoal location from where a picture is shot. Existing methods try to solve this\nproblem by learning a navigation policy, which captures semantic features of\nthe goal image and observation image independently and lastly fuses them for\npredicting a sequence of navigation actions. 
However, these methods suffer from\ntwo major limitations. 1) They may miss detailed information in the goal image,\nand thus fail to reason the goal location. 2) More critically, it is hard to\nfocus on the goal-relevant regions in the observation image, because they\nattempt to understand observation without goal conditioning. In this paper, we\naim to overcome these limitations by designing a Fine-grained Goal Prompting\n(FGPrompt) method for image-goal navigation. In particular, we leverage\nfine-grained and high-resolution feature maps in the goal image as prompts to\nperform conditioned embedding, which preserves detailed information in the goal\nimage and guides the observation encoder to pay attention to goal-relevant\nregions. Compared with existing methods on the image-goal navigation benchmark,\nour method brings significant performance improvement on 3 benchmark datasets\n(i.e., Gibson, MP3D, and HM3D). Especially on Gibson, we surpass the\nstate-of-the-art success rate by 8% with only 1/50 model size. Project page:\nhttps://xinyusun.github.io/fgprompt-pages\n","authors":["Xinyu Sun","Peihao Chen","Jugang Fan","Thomas H. Li","Jian Chen","Mingkui Tan"],"pdf_url":"https://arxiv.org/pdf/2310.07473v1.pdf","comment":"Accepted by NeurIPS 2023"},{"id":"http://arxiv.org/abs/2310.07449v1","updated":"2023-10-11T12:51:16Z","published":"2023-10-11T12:51:16Z","title":"PoRF: Pose Residual Field for Accurate Neural Surface Reconstruction","summary":" Neural surface reconstruction is sensitive to the camera pose noise, even if\nstate-of-the-art pose estimators like COLMAP or ARKit are used. More\nimportantly, existing Pose-NeRF joint optimisation methods have struggled to\nimprove pose accuracy in challenging real-world scenarios. To overcome the\nchallenges, we introduce the pose residual field (\\textbf{PoRF}), a novel\nimplicit representation that uses an MLP for regressing pose updates. This is\nmore robust than the conventional pose parameter optimisation due to parameter\nsharing that leverages global information over the entire sequence.\nFurthermore, we propose an epipolar geometry loss to enhance the supervision\nthat leverages the correspondences exported from COLMAP results without the\nextra computational overhead. Our method yields promising results. On the DTU\ndataset, we reduce the rotation error by 78\\% for COLMAP poses, leading to the\ndecreased reconstruction Chamfer distance from 3.48mm to 0.85mm. On the\nMobileBrick dataset that contains casually captured unbounded 360-degree\nvideos, our method refines ARKit poses and improves the reconstruction F1 score\nfrom 69.18 to 75.67, outperforming that with the dataset provided ground-truth\npose (75.14). These achievements demonstrate the efficacy of our approach in\nrefining camera poses and improving the accuracy of neural surface\nreconstruction in real-world scenarios.\n","authors":["Jia-Wang Bian","Wenjing Bian","Victor Adrian Prisacariu","Philip Torr"],"pdf_url":"https://arxiv.org/pdf/2310.07449v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2310.07440v1","updated":"2023-10-11T12:46:11Z","published":"2023-10-11T12:46:11Z","title":"Distance-based Weighted Transformer Network for Image Completion","summary":" The challenge of image generation has been effectively modeled as a problem\nof structure priors or transformation. 
However, existing models have\nunsatisfactory performance in understanding the global input image structures\nbecause of particular inherent features (for example, local inductive prior).\nRecent studies have shown that self-attention is an efficient modeling\ntechnique for image completion problems. In this paper, we propose a new\narchitecture that relies on Distance-based Weighted Transformer (DWT) to better\nunderstand the relationships between an image's components. In our model, we\nleverage the strengths of both Convolutional Neural Networks (CNNs) and DWT\nblocks to enhance the image completion process. Specifically, CNNs are used to\naugment the local texture information of coarse priors and DWT blocks are used\nto recover certain coarse textures and coherent visual structures. Unlike\ncurrent approaches that generally use CNNs to create feature maps, we use the\nDWT to encode global dependencies and compute distance-based weighted feature\nmaps, which substantially minimizes the problem of visual ambiguities.\nMeanwhile, to better produce repeated textures, we introduce Residual Fast\nFourier Convolution (Res-FFC) blocks to combine the encoder's skip features\nwith the coarse features provided by our generator. Furthermore, a simple yet\neffective technique is proposed to normalize the non-zero values of\nconvolutions, and fine-tune the network layers for regularization of the\ngradient norms to provide an efficient training stabiliser. Extensive\nquantitative and qualitative experiments on three challenging datasets\ndemonstrate the superiority of our proposed model compared to existing\napproaches.\n","authors":["Pourya Shamsolmoali","Masoumeh Zareapoor","Huiyu Zhou","Xuelong Li","Yue Lu"],"pdf_url":"https://arxiv.org/pdf/2310.07440v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07438v1","updated":"2023-10-11T12:41:32Z","published":"2023-10-11T12:41:32Z","title":"DESTINE: Dynamic Goal Queries with Temporal Transductive Alignment for\n Trajectory Prediction","summary":" Predicting temporally consistent road users' trajectories in a multi-agent\nsetting is a challenging task due to unknown characteristics of agents and\ntheir varying intentions. Besides using semantic map information and modeling\ninteractions, it is important to build an effective mechanism capable of\nreasoning about behaviors at different levels of granularity. To this end, we\npropose Dynamic goal quErieS with temporal Transductive alIgNmEnt (DESTINE)\nmethod. Unlike past arts, our approach 1) dynamically predicts agents' goals\nirrespective of particular road structures, such as lanes, allowing the method\nto produce a more accurate estimation of destinations; 2) achieves map\ncompliant predictions by generating future trajectories in a coarse-to-fine\nfashion, where the coarser predictions at a lower frame rate serve as\nintermediate goals; and 3) uses an attention module designed to temporally\nalign predicted trajectories via masked attention. 
Using the common Argoverse\nbenchmark dataset, we show that our method achieves state-of-the-art\nperformance on various metrics, and further investigate the contributions of\nproposed modules via comprehensive ablation studies.\n","authors":["Rezaul Karim","Soheil Mohamad Alizadeh Shabestary","Amir Rasouli"],"pdf_url":"https://arxiv.org/pdf/2310.07438v1.pdf","comment":"6 tables 4 figures"},{"id":"http://arxiv.org/abs/2205.09615v4","updated":"2023-10-11T12:09:35Z","published":"2022-05-19T15:13:00Z","title":"EXACT: How to Train Your Accuracy","summary":" Classification tasks are usually evaluated in terms of accuracy. However,\naccuracy is discontinuous and cannot be directly optimized using gradient\nascent. Popular methods minimize cross-entropy, hinge loss, or other surrogate\nlosses, which can lead to suboptimal results. In this paper, we propose a new\noptimization framework by introducing stochasticity to a model's output and\noptimizing expected accuracy, i.e. accuracy of the stochastic model. Extensive\nexperiments on linear models and deep image classification show that the\nproposed optimization method is a powerful alternative to widely used\nclassification losses.\n","authors":["Ivan Karpukhin","Stanislav Dereka","Sergey Kolesnikov"],"pdf_url":"https://arxiv.org/pdf/2205.09615v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07419v1","updated":"2023-10-11T12:05:44Z","published":"2023-10-11T12:05:44Z","title":"Multi-Concept T2I-Zero: Tweaking Only The Text Embeddings and Nothing\n Else","summary":" Recent advances in text-to-image diffusion models have enabled the\nphotorealistic generation of images from text prompts. Despite the great\nprogress, existing models still struggle to generate compositional\nmulti-concept images naturally, limiting their ability to visualize human\nimagination. While several recent works have attempted to address this issue,\nthey either introduce additional training or adopt guidance at inference time.\nIn this work, we consider a more ambitious goal: natural multi-concept\ngeneration using a pre-trained diffusion model, and with almost no extra cost.\nTo achieve this goal, we identify the limitations in the text embeddings used\nfor the pre-trained text-to-image diffusion models. Specifically, we observe\nconcept dominance and non-localized contribution that severely degrade\nmulti-concept generation performance. We further design a minimal low-cost\nsolution that overcomes the above issues by tweaking (not re-training) the text\nembeddings for more realistic multi-concept text-to-image generation. Our\nCorrection by Similarities method tweaks the embedding of concepts by\ncollecting semantic features from most similar tokens to localize the\ncontribution. To avoid mixing features of concepts, we also apply Cross-Token\nNon-Maximum Suppression, which excludes the overlap of contributions from\ndifferent concepts. 
Experiments show that our approach outperforms previous\nmethods in text-to-image, image manipulation, and personalization tasks,\ndespite not introducing additional training or inference costs to the diffusion\nsteps.\n","authors":["Hazarapet Tunanyan","Dejia Xu","Shant Navasardyan","Zhangyang Wang","Humphrey Shi"],"pdf_url":"https://arxiv.org/pdf/2310.07419v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07416v1","updated":"2023-10-11T12:01:52Z","published":"2023-10-11T12:01:52Z","title":"A Novel Voronoi-based Convolutional Neural Network Framework for Pushing\n Person Detection in Crowd Videos","summary":" Analyzing the microscopic dynamics of pushing behavior within crowds can\noffer valuable insights into crowd patterns and interactions. By identifying\ninstances of pushing in crowd videos, a deeper understanding of when, where,\nand why such behavior occurs can be achieved. This knowledge is crucial to\ncreating more effective crowd management strategies, optimizing crowd flow, and\nenhancing overall crowd experiences. However, manually identifying pushing\nbehavior at the microscopic level is challenging, and the existing automatic\napproaches cannot detect such microscopic behavior. Thus, this article\nintroduces a novel automatic framework for identifying pushing in videos of\ncrowds on a microscopic level. The framework comprises two main components: i)\nFeature extraction and ii) Video labeling. In the feature extraction component,\na new Voronoi-based method is developed for determining the local regions\nassociated with each person in the input video. Subsequently, these regions are\nfed into EfficientNetV1B0 Convolutional Neural Network to extract the deep\nfeatures of each person over time. In the second component, a combination of a\nfully connected layer with a Sigmoid activation function is employed to analyze\nthese deep features and annotate the individuals involved in pushing within the\nvideo. The framework is trained and evaluated on a new dataset created using\nsix real-world experiments, including their corresponding ground truths. The\nexperimental findings indicate that the suggested framework outperforms seven\nbaseline methods that are employed for comparative analysis purposes.\n","authors":["Ahmed Alia","Mohammed Maree","Mohcine Chraibi","Armin Seyfried"],"pdf_url":"https://arxiv.org/pdf/2310.07416v1.pdf","comment":"21 pages"},{"id":"http://arxiv.org/abs/2309.14065v4","updated":"2023-10-11T11:43:41Z","published":"2023-09-25T11:57:16Z","title":"AsymFormer: Asymmetrical Cross-Modal Representation Learning for Mobile\n Platform Real-Time RGB-D Semantic Segmentation","summary":" In the realm of robotic intelligence, achieving efficient and precise RGB-D\nsemantic segmentation is a key cornerstone. State-of-the-art multimodal\nsemantic segmentation methods, primarily rooted in symmetrical skeleton\nnetworks, find it challenging to harmonize computational efficiency and\nprecision. In this work, we propose AsymFormer, a novel network for real-time\nRGB-D semantic segmentation, which targets the minimization of superfluous\nparameters by optimizing the distribution of computational resources and\nintroduces an asymmetrical backbone to allow for the effective fusion of\nmultimodal features. Furthermore, we explore techniques to bolster network\naccuracy by redefining feature selection and extracting multi-modal\nself-similarity features without a substantial increase in the parameter count,\nthereby ensuring real-time execution on robotic platforms. 
Additionally, a\nLocal Attention-Guided Feature Selection (LAFS) module is used to selectively\nfuse features from different modalities by leveraging their dependencies.\nSubsequently, a Cross-Modal Attention-Guided Feature Correlation Embedding\n(CMA) module is introduced to further extract cross-modal representations. This\nmethod is evaluated on NYUv2 and SUNRGBD datasets, with AsymFormer\ndemonstrating competitive results with 52.0% mIoU on NYUv2 and 49.1% mIoU on\nSUNRGBD. Notably, AsymFormer achieves an inference speed of 65 FPS and after\nimplementing mixed precision quantization, it attains an impressive inference\nspeed of 79 FPS on RTX3090. This significantly outperforms existing multi-modal\nmethods, thereby demonstrating that AsymFormer can strike a balance between\nhigh accuracy and efficiency for RGB-D semantic segmentation.\n","authors":["Siqi Du","Weixi Wang","Renzhong Guo","Shengjun Tang"],"pdf_url":"https://arxiv.org/pdf/2309.14065v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.04707v2","updated":"2023-10-11T11:30:32Z","published":"2023-03-08T16:48:24Z","title":"DiM: Distilling Dataset into Generative Model","summary":" Dataset distillation reduces the network training cost by synthesizing small\nand informative datasets from large-scale ones. Despite the success of the\nrecent dataset distillation algorithms, three drawbacks still limit their wider\napplication: i). the synthetic images perform poorly on large architectures;\nii). they need to be re-optimized when the distillation ratio changes; iii).\nthe limited diversity restricts the performance when the distillation ratio is\nlarge. In this paper, we propose a novel distillation scheme to\n\\textbf{D}istill information of large train sets \\textbf{i}nto generative\n\\textbf{M}odels, named DiM. Specifically, DiM learns to use a generative model\nto store the information of the target dataset. During the distillation phase,\nwe minimize the differences in logits predicted by a models pool between real\nand generated images. At the deployment stage, the generative model synthesizes\nvarious training samples from random noises on the fly. Due to the simple yet\neffective designs, the trained DiM can be directly applied to different\ndistillation ratios and large architectures without extra cost. We validate the\nproposed DiM across 4 datasets and achieve state-of-the-art results on all of\nthem. To the best of our knowledge, we are the first to achieve higher accuracy\non complex architectures than simple ones, such as 75.1\\% with ResNet-18 and\n72.6\\% with ConvNet-3 on ten images per class of CIFAR-10. Besides, DiM\noutperforms previous methods with 10\\% $\\sim$ 22\\% when images per class are 1\nand 10 on the SVHN dataset.\n","authors":["Kai Wang","Jianyang Gu","Daquan Zhou","Zheng Zhu","Wei Jiang","Yang You"],"pdf_url":"https://arxiv.org/pdf/2303.04707v2.pdf","comment":"Distilling datasets into generative models"},{"id":"http://arxiv.org/abs/2310.05682v2","updated":"2023-10-11T11:28:40Z","published":"2023-10-09T12:51:46Z","title":"Analysis of Rainfall Variability and Water Extent of Selected Hydropower\n Reservoir Using Google Earth Engine (GEE): A Case Study from Two Tropical\n Countries, Sri Lanka and Vietnam","summary":" This study presents a comprehensive remote sensing analysis of rainfall\npatterns and selected hydropower reservoir water extent in two tropical monsoon\ncountries, Vietnam and Sri Lanka. 
The aim is to understand the relationship\nbetween remotely sensed rainfall data and the dynamic changes (monthly) in\nreservoir water extent. The analysis utilizes high-resolution optical imagery\nand Sentinel-1 Synthetic Aperture Radar (SAR) data to observe and monitor water\nbodies during different weather conditions, especially during the monsoon\nseason. The average annual rainfall for both countries is determined, and\nspatiotemporal variations in monthly average rainfall are examined at regional\nand reservoir basin levels using the Climate Hazards Group InfraRed\nPrecipitation with Station (CHIRPS) dataset from 1981 to 2022. Water extents\nare derived for selected reservoirs using Sentinel-1 SAR Ground Range Detected\n(GRD) images in Vietnam and Sri Lanka from 2017 to 2022. The images are\npre-processed and corrected using terrain correction and a refined Lee filter. An\nautomated thresholding algorithm, OTSU, distinguishes water and land, taking\nadvantage of both VV and VH polarization data. A connected pixel count\nthreshold is applied to enhance result accuracy. The results indicate a clear\nrelationship between rainfall patterns and reservoir water extent, with\nincreased precipitation during the monsoon season leading to higher water\nextents in the later months. This study contributes to understanding how\nrainfall variability impacts reservoir water resources in tropical monsoon\nregions. The preliminary findings can inform water resource management\nstrategies and support these countries' decision-making processes related to\nhydropower generation, flood management, and irrigation.\n","authors":["Punsisi Rajakaruna","Surajit Ghosh","Bunyod Holmatov"],"pdf_url":"https://arxiv.org/pdf/2310.05682v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07394v1","updated":"2023-10-11T11:26:35Z","published":"2023-10-11T11:26:35Z","title":"CLIP for Lightweight Semantic Segmentation","summary":" The large-scale pretrained model CLIP, trained on 400 million image-text\npairs, offers a promising paradigm for tackling vision tasks, albeit at the\nimage level. Later works, such as DenseCLIP and LSeg, extend this paradigm to\ndense prediction, including semantic segmentation, and have achieved excellent\nresults. However, the above methods either rely on CLIP-pretrained visual\nbackbones or use non-pretrained but heavy backbones such as Swin, while\nproving ineffective when applied to lightweight backbones. The reason for this\nis that lightweight networks, whose feature extraction ability is relatively\nlimited, struggle to produce image features that align well with the\ntext embeddings. In this work, we present a new feature fusion module\nwhich tackles this problem and enables the language-guided paradigm to be applied\nto lightweight networks. Specifically, the module is a parallel design of CNN\nand transformer with a two-way bridge in between, where the CNN extracts spatial\ninformation and visual context of the feature map from the image encoder, and\nthe transformer propagates text embeddings from the text encoder forward. The\ncore of the module is the bidirectional fusion of visual and text features\nacross the bridge, which promotes their proximity and alignment in embedding\nspace. 
The module is model-agnostic, which can not only make language-guided\nlightweight semantic segmentation practical, but also fully exploit the\npretrained knowledge of language priors and achieve better performance than\nprevious SOTA work, such as DenseCLIP, whatever the vision backbone is.\nExtensive experiments have been conducted to demonstrate the superiority of our\nmethod.\n","authors":["Ke Jin","Wankou Yang"],"pdf_url":"https://arxiv.org/pdf/2310.07394v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07379v1","updated":"2023-10-11T10:54:44Z","published":"2023-10-11T10:54:44Z","title":"Causal Unsupervised Semantic Segmentation","summary":" Unsupervised semantic segmentation aims to achieve high-quality semantic\ngrouping without human-labeled annotations. With the advent of self-supervised\npre-training, various frameworks utilize the pre-trained features to train\nprediction heads for unsupervised dense prediction. However, a significant\nchallenge in this unsupervised setup is determining the appropriate level of\nclustering required for segmenting concepts. To address it, we propose a novel\nframework, CAusal Unsupervised Semantic sEgmentation (CAUSE), which leverages\ninsights from causal inference. Specifically, we bridge intervention-oriented\napproach (i.e., frontdoor adjustment) to define suitable two-step tasks for\nunsupervised prediction. The first step involves constructing a concept\nclusterbook as a mediator, which represents possible concept prototypes at\ndifferent levels of granularity in a discretized form. Then, the mediator\nestablishes an explicit link to the subsequent concept-wise self-supervised\nlearning for pixel-level grouping. Through extensive experiments and analyses\non various datasets, we corroborate the effectiveness of CAUSE and achieve\nstate-of-the-art performance in unsupervised semantic segmentation.\n","authors":["Junho Kim","Byung-Kwan Lee","Yong Man Ro"],"pdf_url":"https://arxiv.org/pdf/2310.07379v1.pdf","comment":"code available:\n https://github.com/ByungKwanLee/Causal-Unsupervised-Segmentation"},{"id":"http://arxiv.org/abs/2310.07376v1","updated":"2023-10-11T10:50:15Z","published":"2023-10-11T10:50:15Z","title":"Point Cloud Denoising and Outlier Detection with Local Geometric\n Structure by Dynamic Graph CNN","summary":" The digitalization of society is rapidly developing toward the realization of\nthe digital twin and metaverse. In particular, point clouds are attracting\nattention as a media format for 3D space. Point cloud data is contaminated with\nnoise and outliers due to measurement errors. Therefore, denoising and outlier\ndetection are necessary for point cloud processing. Among them, PointCleanNet\nis an effective method for point cloud denoising and outlier detection.\nHowever, it does not consider the local geometric structure of the patch. We\nsolve this problem by applying two types of graph convolutional layer designed\nbased on the Dynamic Graph CNN. 
Experimental results show that the proposed\nmethods outperform the conventional method in AUPR, which indicates outlier\ndetection accuracy, and Chamfer Distance, which indicates denoising accuracy.\n","authors":["Kosuke Nakayama","Hiroto Fukuta","Hiroshi Watanabe"],"pdf_url":"https://arxiv.org/pdf/2310.07376v1.pdf","comment":"2023 IEEE 12th Global Conference on Consumer Electronics (GCCE 2023)"},{"id":"http://arxiv.org/abs/2303.16570v2","updated":"2023-10-11T10:41:11Z","published":"2023-03-29T10:08:29Z","title":"Point2Vec for Self-Supervised Representation Learning on Point Clouds","summary":" Recently, the self-supervised learning framework data2vec has shown inspiring\nperformance for various modalities using a masked student-teacher approach.\nHowever, it remains open whether such a framework generalizes to the unique\nchallenges of 3D point clouds. To answer this question, we extend data2vec to\nthe point cloud domain and report encouraging results on several downstream\ntasks. In an in-depth analysis, we discover that the leakage of positional\ninformation reveals the overall object shape to the student even under heavy\nmasking and thus hampers data2vec to learn strong representations for point\nclouds. We address this 3D-specific shortcoming by proposing point2vec, which\nunleashes the full potential of data2vec-like pre-training on point clouds. Our\nexperiments show that point2vec outperforms other self-supervised methods on\nshape classification and few-shot learning on ModelNet40 and ScanObjectNN,\nwhile achieving competitive results on part segmentation on ShapeNetParts.\nThese results suggest that the learned representations are strong and\ntransferable, highlighting point2vec as a promising direction for\nself-supervised learning of point cloud representations.\n","authors":["Karim Abou Zeid","Jonas Schult","Alexander Hermans","Bastian Leibe"],"pdf_url":"https://arxiv.org/pdf/2303.16570v2.pdf","comment":"Accepted at GCPR 2023. Project page at\n https://vision.rwth-aachen.de/point2vec"},{"id":"http://arxiv.org/abs/2307.00773v3","updated":"2023-10-11T10:29:59Z","published":"2023-07-03T06:33:49Z","title":"DifFSS: Diffusion Model for Few-Shot Semantic Segmentation","summary":" Diffusion models have demonstrated excellent performance in image generation.\nAlthough various few-shot semantic segmentation (FSS) models with different\nnetwork structures have been proposed, performance improvement has reached a\nbottleneck. This paper presents the first work to leverage the diffusion model\nfor FSS task, called DifFSS. DifFSS, a novel FSS paradigm, can further improve\nthe performance of the state-of-the-art FSS models by a large margin without\nmodifying their network structure. Specifically, we utilize the powerful\ngeneration ability of diffusion models to generate diverse auxiliary support\nimages by using the semantic mask, scribble or soft HED boundary of the support\nimage as control conditions. This generation process simulates the variety\nwithin the class of the query image, such as color, texture variation,\nlighting, $etc$. As a result, FSS models can refer to more diverse support\nimages, yielding more robust representations, thereby achieving a consistent\nimprovement in segmentation performance. Extensive experiments on three\npublicly available datasets based on existing advanced FSS models demonstrate\nthe effectiveness of the diffusion model for FSS task. 
Furthermore, we explore\nin detail the impact of different input settings of the diffusion model on\nsegmentation performance. Hopefully, this completely new paradigm will bring\ninspiration to the study of FSS task integrated with AI-generated content. Code\nis available at https://github.com/TrinitialChan/DifFSS\n","authors":["Weimin Tan","Siyuan Chen","Bo Yan"],"pdf_url":"https://arxiv.org/pdf/2307.00773v3.pdf","comment":"code is available at https://github.com/TrinitialChan/DifFSS"},{"id":"http://arxiv.org/abs/2305.03989v2","updated":"2023-10-11T10:26:27Z","published":"2023-05-06T09:29:12Z","title":"LEO: Generative Latent Image Animator for Human Video Synthesis","summary":" Spatio-temporal coherency is a major challenge in synthesizing high quality\nvideos, particularly in synthesizing human videos that contain rich global and\nlocal deformations. To resolve this challenge, previous approaches have\nresorted to different features in the generation process aimed at representing\nappearance and motion. However, in the absence of strict mechanisms to\nguarantee such disentanglement, a separation of motion from appearance has\nremained challenging, resulting in spatial distortions and temporal jittering\nthat break the spatio-temporal coherency. Motivated by this, we here propose\nLEO, a novel framework for human video synthesis, placing emphasis on\nspatio-temporal coherency. Our key idea is to represent motion as a sequence of\nflow maps in the generation process, which inherently isolate motion from\nappearance. We implement this idea via a flow-based image animator and a Latent\nMotion Diffusion Model (LMDM). The former bridges a space of motion codes with\nthe space of flow maps, and synthesizes video frames in a warp-and-inpaint\nmanner. LMDM learns to capture motion prior in the training data by\nsynthesizing sequences of motion codes. Extensive quantitative and qualitative\nanalysis suggests that LEO significantly improves coherent synthesis of human\nvideos over previous methods on the datasets TaichiHD, FaceForensics and\nCelebV-HQ. In addition, the effective disentanglement of appearance and motion\nin LEO allows for two additional tasks, namely infinite-length human video\nsynthesis, as well as content-preserving video editing.\n","authors":["Yaohui Wang","Xin Ma","Xinyuan Chen","Antitza Dantcheva","Bo Dai","Yu Qiao"],"pdf_url":"https://arxiv.org/pdf/2305.03989v2.pdf","comment":"Project webpage: https://wyhsirius.github.io/LEO-project/"},{"id":"http://arxiv.org/abs/2310.07361v1","updated":"2023-10-11T10:21:34Z","published":"2023-10-11T10:21:34Z","title":"Domain Generalization Guided by Gradient Signal to Noise Ratio of\n Parameters","summary":" Overfitting to the source domain is a common issue in gradient-based training\nof deep neural networks. To compensate for the over-parameterized models,\nnumerous regularization techniques have been introduced such as those based on\ndropout. While these methods achieve significant improvements on classical\nbenchmarks such as ImageNet, their performance diminishes with the introduction\nof domain shift in the test set i.e. when the unseen data comes from a\nsignificantly different distribution. In this paper, we move away from the\nclassical approach of Bernoulli sampled dropout mask construction and propose\nto base the selection on gradient-signal-to-noise ratio (GSNR) of network's\nparameters. Specifically, at each training step, parameters with high GSNR will\nbe discarded. 
Furthermore, we alleviate the burden of manually searching for\nthe optimal dropout ratio by leveraging a meta-learning approach. We evaluate\nour method on standard domain generalization benchmarks and achieve competitive\nresults on classification and face anti-spoofing problems.\n","authors":["Mateusz Michalkiewicz","Masoud Faraki","Xiang Yu","Manmohan Chandraker","Mahsa Baktashmotlagh"],"pdf_url":"https://arxiv.org/pdf/2310.07361v1.pdf","comment":"Paper was accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2310.07359v1","updated":"2023-10-11T10:17:41Z","published":"2023-10-11T10:17:41Z","title":"Diagnosing Bipolar Disorder from 3-D Structural Magnetic Resonance\n Images Using a Hybrid GAN-CNN Method","summary":" Bipolar Disorder (BD) is a psychiatric condition diagnosed by repetitive\ncycles of hypomania and depression. Since diagnosing BD relies on subjective\nbehavioral assessments over a long period, a solid diagnosis based on objective\ncriteria is not straightforward. The current study responded to the described\nobstacle by proposing a hybrid GAN-CNN model to diagnose BD from 3-D structural\nMRI Images (sMRI). The novelty of this study stems from diagnosing BD from sMRI\nsamples rather than conventional datasets such as functional MRI (fMRI),\nelectroencephalography (EEG), and behavioral symptoms while removing the data\ninsufficiency usually encountered when dealing with sMRI samples. The impact of\nvarious augmentation ratios is also tested using 5-fold cross-validation. Based\non the results, this study obtains an accuracy rate of 75.8%, a sensitivity of\n60.3%, and a specificity of 82.5%, which are 3-5% higher than prior work while\nutilizing less than 6% sample counts. Next, it is demonstrated that a 2- D\nlayer-based GAN generator can effectively reproduce complex 3D brain samples, a\nmore straightforward technique than manual image processing. Lastly, the\noptimum augmentation threshold for the current study using 172 sMRI samples is\n50%, showing the applicability of the described method for larger sMRI\ndatasets. In conclusion, it is established that data augmentation using GAN\nimproves the accuracy of the CNN classifier using sMRI samples, thus developing\nmore reliable decision support systems to assist practitioners in identifying\nBD patients more reliably and in a shorter period\n","authors":["Masood Hamed Saghayan","Mohammad Hossein Zolfagharnasab","Ali Khadem","Farzam Matinfar","Hassan Rashidi"],"pdf_url":"https://arxiv.org/pdf/2310.07359v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07355v1","updated":"2023-10-11T10:12:43Z","published":"2023-10-11T10:12:43Z","title":"IMITATE: Clinical Prior Guided Hierarchical Vision-Language Pre-training","summary":" In the field of medical Vision-Language Pre-training (VLP), significant\nefforts have been devoted to deriving text and image features from both\nclinical reports and associated medical images. However, most existing methods\nmay have overlooked the opportunity in leveraging the inherent hierarchical\nstructure of clinical reports, which are generally split into `findings' for\ndescriptive content and `impressions' for conclusive observation. Instead of\nutilizing this rich, structured format, current medical VLP approaches often\nsimplify the report into either a unified entity or fragmented tokens. In this\nwork, we propose a novel clinical prior guided VLP framework named IMITATE to\nlearn the structure information from medical reports with hierarchical\nvision-language alignment. 
The framework derives multi-level visual features\nfrom the chest X-ray (CXR) images and separately aligns these features with the\ndescriptive and the conclusive text encoded in the hierarchical medical report.\nFurthermore, a new clinical-informed contrastive loss is introduced for\ncross-modal learning, which accounts for clinical prior knowledge in\nformulating sample correlations in contrastive learning. The proposed model,\nIMITATE, outperforms baseline VLP methods across six different datasets,\nspanning five medical imaging downstream tasks. Comprehensive experimental\nresults highlight the advantages of integrating the hierarchical structure of\nmedical reports for vision-language alignment.\n","authors":["Che Liu","Sibo Cheng","Miaojing Shi","Anand Shah","Wenjia Bai","Rossella Arcucci"],"pdf_url":"https://arxiv.org/pdf/2310.07355v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2307.00040v2","updated":"2023-10-11T10:11:36Z","published":"2023-06-30T17:37:48Z","title":"DisCo: Disentangled Control for Realistic Human Dance Generation","summary":" Generative AI has made significant strides in computer vision, particularly\nin text-driven image/video synthesis (T2I/T2V). Despite the notable\nadvancements, it remains challenging in human-centric content synthesis such as\nrealistic dance generation. Current methodologies, primarily tailored for human\nmotion transfer, encounter difficulties when confronted with real-world dance\nscenarios (e.g., social media dance) which require to generalize across a wide\nspectrum of poses and intricate human details. In this paper, we depart from\nthe traditional paradigm of human motion transfer and emphasize two additional\ncritical attributes for the synthesis of human dance content in social media\ncontexts: (i) Generalizability: the model should be able to generalize beyond\ngeneric human viewpoints as well as unseen human subjects, backgrounds, and\nposes; (ii) Compositionality: it should allow for composition of seen/unseen\nsubjects, backgrounds, and poses from different sources seamlessly. To address\nthese challenges, we introduce DisCo, which includes a novel model architecture\nwith disentangled control to improve the compositionality of dance synthesis,\nand an effective human attribute pre-training for better generalizability to\nunseen humans. Extensive qualitative and quantitative results demonstrate that\nDisCo can generate high-quality human dance images and videos with diverse\nappearances and flexible motions. Code, demo, video and visualization are\navailable at: https://disco-dance.github.io/.\n","authors":["Tan Wang","Linjie Li","Kevin Lin","Yuanhao Zhai","Chung-Ching Lin","Zhengyuan Yang","Hanwang Zhang","Zicheng Liu","Lijuan Wang"],"pdf_url":"https://arxiv.org/pdf/2307.00040v2.pdf","comment":"Project Page: https://disco-dance.github.io/ ; Add temporal module ;\n Synchronize FVD computation with MCVD ; More baselines and visualizations"},{"id":"http://arxiv.org/abs/2307.02869v2","updated":"2023-10-11T10:03:08Z","published":"2023-07-06T09:12:13Z","title":"MomentDiff: Generative Video Moment Retrieval from Random to Real","summary":" Video moment retrieval pursues an efficient and generalized solution to\nidentify the specific temporal segments within an untrimmed video that\ncorrespond to a given language description. 
To achieve this goal, we provide a\ngenerative diffusion-based framework called MomentDiff, which simulates a\ntypical human retrieval process from random browsing to gradual localization.\nSpecifically, we first diffuse the real span to random noise, and learn to\ndenoise the random noise to the original span with the guidance of similarity\nbetween text and video. This allows the model to learn a mapping from arbitrary\nrandom locations to real moments, enabling the ability to locate segments from\nrandom initialization. Once trained, MomentDiff could sample random temporal\nsegments as initial guesses and iteratively refine them to generate an accurate\ntemporal boundary. Different from discriminative works (e.g., based on\nlearnable proposals or queries), MomentDiff with random initialized spans could\nresist the temporal location biases from datasets. To evaluate the influence of\nthe temporal location biases, we propose two anti-bias datasets with location\ndistribution shifts, named Charades-STA-Len and Charades-STA-Mom. The\nexperimental results demonstrate that our efficient framework consistently\noutperforms state-of-the-art methods on three public benchmarks, and exhibits\nbetter generalization and robustness on the proposed anti-bias datasets. The\ncode, model, and anti-bias evaluation datasets are available at\nhttps://github.com/IMCCretrieval/MomentDiff.\n","authors":["Pandeng Li","Chen-Wei Xie","Hongtao Xie","Liming Zhao","Lei Zhang","Yun Zheng","Deli Zhao","Yongdong Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.02869v2.pdf","comment":"19 pages, 6 figures"},{"id":"http://arxiv.org/abs/2304.06841v2","updated":"2023-10-11T09:53:51Z","published":"2023-04-13T22:20:54Z","title":"Video alignment using unsupervised learning of local and global features","summary":" In this paper, we tackle the problem of video alignment, the process of\nmatching the frames of a pair of videos containing similar actions. The main\nchallenge in video alignment is that accurate correspondence should be\nestablished despite the differences in the execution processes and appearances\nbetween the two videos. We introduce an unsupervised method for alignment that\nuses global and local features of the frames. In particular, we introduce\neffective features for each video frame using three machine vision tools:\nperson detection, pose estimation, and VGG network. Then, the features are\nprocessed and combined to construct a multidimensional time series that\nrepresents the video. The resulting time series are used to align videos of the\nsame actions using a novel version of dynamic time warping named Diagonalized\nDynamic Time Warping(DDTW). The main advantage of our approach is that no\ntraining is required, which makes it applicable for any new type of action\nwithout any need to collect training samples for it. For evaluation, we\nconsidered video synchronization and phase classification tasks on the Penn\naction dataset. Also, for an effective evaluation of the video synchronization\ntask, we present a new metric called Enclosed Area Error(EAE). 
The results show\nthat our method outperforms previous state-of-the-art methods, such as TCC, and\nother self-supervised and weakly supervised methods.\n","authors":["Niloufar Fakhfour","Mohammad ShahverdiKondori","Hoda Mohammadzade"],"pdf_url":"https://arxiv.org/pdf/2304.06841v2.pdf","comment":"19 pages, 6 figures"},{"id":"http://arxiv.org/abs/2310.02044v2","updated":"2023-10-11T09:21:23Z","published":"2023-10-03T13:35:49Z","title":"Video Transformers under Occlusion: How Physics and Background\n Attributes Impact Large Models for Robotic Manipulation","summary":" As transformer architectures and dataset sizes continue to scale, the need to\nunderstand the specific dataset factors affecting model performance becomes\nincreasingly urgent. This paper investigates how object physics attributes\n(color, friction coefficient, shape) and background characteristics (static,\ndynamic, background complexity) influence the performance of Video Transformers\nin trajectory prediction tasks under occlusion. Beyond mere occlusion\nchallenges, this study aims to investigate three questions: How do object\nphysics attributes and background characteristics influence the model\nperformance? What kinds of attributes are most influential to the model\ngeneralization? Is there a data saturation point for large transformer model\nperformance within a single task? To facilitate this research, we present\nOccluManip, a real-world video-based robot pushing dataset comprising 460,000\nconsistent recordings of objects with different physics and varying\nbackgrounds. 1.4 TB and in total 1278 hours of high-quality videos of flexible\ntemporal length along with target object trajectories are collected,\naccommodating tasks with different temporal requirements. Additionally, we\npropose Video Occlusion Transformer (VOT), a generic video-transformer-based\nnetwork achieving an average 96% accuracy across all 18 sub-datasets provided\nin OccluManip. OccluManip and VOT will be released at:\nhttps://github.com/ShutongJIN/OccluManip.git\n","authors":["Shutong Jin","Ruiyu Wang","Muhammad Zahid","Florian T. Pokorny"],"pdf_url":"https://arxiv.org/pdf/2310.02044v2.pdf","comment":"Under review at IEEE ICRA 2024"},{"id":"http://arxiv.org/abs/2310.07324v1","updated":"2023-10-11T09:14:30Z","published":"2023-10-11T09:14:30Z","title":"Guided Attention for Interpretable Motion Captioning","summary":" While much effort has been invested in generating human motion from text,\nrelatively few studies have been dedicated to the reverse direction, that is,\ngenerating text from motion. Much of the research focuses on maximizing\ngeneration quality without any regard for the interpretability of the\narchitectures, particularly regarding the influence of particular body parts in\nthe generation and the temporal synchronization of words with specific\nmovements and actions. This study explores the combination of movement encoders\nwith spatio-temporal attention models and proposes strategies to guide the\nattention during training to highlight perceptually pertinent areas of the\nskeleton in time. We show that adding guided attention with adaptive gate leads\nto interpretable captioning while improving performance compared to higher\nparameter-count non-interpretable SOTA systems. On the KIT MLD dataset, we\nobtain a BLEU@4 of 24.4% (SOTA+6%), a ROUGE-L of 58.30% (SOTA +14.1%), a CIDEr\nof 112.10 (SOTA +32.6) and a Bertscore of 41.20% (SOTA +18.20%). 
On HumanML3D,\nwe obtain a BLEU@4 of 25.00 (SOTA +2.7%), a ROUGE-L score of 55.4% (SOTA\n+6.1%), a CIDEr of 61.6 (SOTA -10.9%), a Bertscore of 40.3% (SOTA +2.5%). Our\ncode implementation and reproduction details will be soon available at\nhttps://github.com/rd20karim/M2T-Interpretable/tree/main.\n","authors":["Karim Radouane","Andon Tchechmedjiev","Sylvie Ranwez","Julien Lagarde"],"pdf_url":"https://arxiv.org/pdf/2310.07324v1.pdf","comment":"arXiv preprint"},{"id":"http://arxiv.org/abs/2310.07322v1","updated":"2023-10-11T09:12:42Z","published":"2023-10-11T09:12:42Z","title":"A webcam-based machine learning approach for three-dimensional range of\n motion evaluation","summary":" Background. Joint range of motion (ROM) is an important quantitative measure\nfor physical therapy. Commonly relying on a goniometer, accurate and reliable\nROM measurement requires extensive training and practice. This, in turn,\nimposes a significant barrier for those who have limited in-person access to\nhealthcare.\n Objective. The current study presents and evaluates an alternative machine\nlearning-based ROM evaluation method that could be remotely accessed via a\nwebcam.\n Methods. To evaluate its reliability, the ROM measurements for a diverse set\nof joints (neck, spine, and upper and lower extremities) derived using this\nmethod were compared to those obtained from a marker-based optical motion\ncapture system.\n Results. Data collected from 25 healthy adults demonstrated that the webcam\nsolution exhibited high test-retest reliability, with substantial to almost\nperfect intraclass correlation coefficients for most joints. Compared with the\nmarker-based system, the webcam-based system demonstrated substantial to almost\nperfect inter-rater reliability for some joints, and lower inter-rater\nreliability for other joints (e.g., shoulder flexion and elbow flexion), which\ncould be attributed to the reduced sensitivity to joint locations at the apex\nof the movement.\n Conclusions. The proposed webcam-based method exhibited high test-retest and\ninter-rater reliability, making it a versatile alternative for existing ROM\nevaluation methods in clinical practice and the tele-implementation of physical\ntherapy and rehabilitation.\n","authors":["Xiaoye Michael Wang","Derek T. Smith","Qin Zhu"],"pdf_url":"https://arxiv.org/pdf/2310.07322v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07310v1","updated":"2023-10-11T08:47:29Z","published":"2023-10-11T08:47:29Z","title":"Deep Aramaic: Towards a Synthetic Data Paradigm Enabling Machine\n Learning in Epigraphy","summary":" Epigraphy increasingly turns to modern artificial intelligence (AI)\ntechnologies such as machine learning (ML) for extracting insights from ancient\ninscriptions. However, scarce labeled data for training ML algorithms severely\nlimits current techniques, especially for ancient scripts like Old Aramaic. Our\nresearch pioneers an innovative methodology for generating synthetic training\ndata tailored to Old Aramaic letters. Our pipeline synthesizes photo-realistic\nAramaic letter datasets, incorporating textural features, lighting, damage, and\naugmentations to mimic real-world inscription diversity. Despite minimal real\nexamples, we engineer a dataset of 250,000 training and 25,000 validation\nimages covering the 22 letter classes in the Aramaic alphabet. This\ncomprehensive corpus provides a robust volume of data for training a residual\nneural network (ResNet) to classify highly degraded Aramaic letters. 
The ResNet\nmodel demonstrates high accuracy in classifying real images from the 8th\ncentury BCE Hadad statue inscription. Additional experiments validate\nperformance on varying materials and styles, proving effective generalization.\nOur results validate the model's capabilities in handling diverse real-world\nscenarios, proving the viability of our synthetic data approach and avoiding\nthe dependence on scarce training data that has constrained epigraphic\nanalysis. Our innovative framework elevates interpretation accuracy on damaged\ninscriptions, thus enhancing knowledge extraction from these historical\nresources.\n","authors":["Andrei C. Aioanei","Regine Hunziker-Rodewald","Konstantin Klein","Dominik L. Michels"],"pdf_url":"https://arxiv.org/pdf/2310.07310v1.pdf","comment":"41 pages, 19 images"},{"id":"http://arxiv.org/abs/2206.07255v2","updated":"2023-10-11T08:41:34Z","published":"2022-06-15T02:35:51Z","title":"GRAM-HD: 3D-Consistent Image Generation at High Resolution with\n Generative Radiance Manifolds","summary":" Recent works have shown that 3D-aware GANs trained on unstructured single\nimage collections can generate multiview images of novel instances. The key\nunderpinnings to achieve this are a 3D radiance field generator and a volume\nrendering process. However, existing methods either cannot generate\nhigh-resolution images (e.g., up to 256X256) due to the high computation cost\nof neural volume rendering, or rely on 2D CNNs for image-space upsampling which\njeopardizes the 3D consistency across different views. This paper proposes a\nnovel 3D-aware GAN that can generate high resolution images (up to 1024X1024)\nwhile keeping strict 3D consistency as in volume rendering. Our motivation is\nto achieve super-resolution directly in the 3D space to preserve 3D\nconsistency. We avoid the otherwise prohibitively-expensive computation cost by\napplying 2D convolutions on a set of 2D radiance manifolds defined in the\nrecent generative radiance manifold (GRAM) approach, and apply dedicated loss\nfunctions for effective GAN training at high resolution. Experiments on FFHQ\nand AFHQv2 datasets show that our method can produce high-quality 3D-consistent\nresults that significantly outperform existing methods. It makes a significant\nstep towards closing the gap between traditional 2D image generation and\n3D-consistent free-view generation.\n","authors":["Jianfeng Xiang","Jiaolong Yang","Yu Deng","Xin Tong"],"pdf_url":"https://arxiv.org/pdf/2206.07255v2.pdf","comment":"ICCV2023 camera ready version (more results and method comparisons).\n Project page: https://jeffreyxiang.github.io/GRAM-HD/"},{"id":"http://arxiv.org/abs/2301.11986v2","updated":"2023-10-11T08:25:26Z","published":"2023-01-27T20:54:58Z","title":"Enhancing Face Recognition with Latent Space Data Augmentation and\n Facial Posture Reconstruction","summary":" The small amount of training data for many state-of-the-art deep\nlearning-based Face Recognition (FR) systems causes a marked deterioration in\ntheir performance. Although a considerable amount of research has addressed\nthis issue by inventing new data augmentation techniques, using either input\nspace transformations or Generative Adversarial Networks (GAN) for feature\nspace augmentations, these techniques have yet to satisfy expectations. In this\npaper, we propose an approach named the Face Representation Augmentation (FRA)\nfor augmenting face datasets. 
To the best of our knowledge, FRA is the first\nmethod that shifts its focus towards manipulating the face embeddings generated\nby any face representation learning algorithm to create new embeddings\nrepresenting the same identity and facial emotion but with an altered posture.\nExtensive experiments conducted in this study convince of the efficacy of our\nmethodology and its power to provide noiseless, completely new facial\nrepresentations to improve the training procedure of any FR algorithm.\nTherefore, FRA can help the recent state-of-the-art FR methods by providing\nmore data for training FR systems. The proposed method, using experiments\nconducted on the Karolinska Directed Emotional Faces (KDEF) dataset, improves\nthe identity classification accuracies by 9.52 %, 10.04 %, and 16.60 %, in\ncomparison with the base models of MagFace, ArcFace, and CosFace, respectively.\n","authors":["Soroush Hashemifar","Abdolreza Marefat","Javad Hassannataj Joloudari","Hamid Hassanpour"],"pdf_url":"https://arxiv.org/pdf/2301.11986v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.04189v2","updated":"2023-10-11T08:01:11Z","published":"2023-10-06T12:08:15Z","title":"Bridging the Gap between Human Motion and Action Semantics via Kinematic\n Phrases","summary":" The goal of motion understanding is to establish a reliable mapping between\nmotion and action semantics, while it is a challenging many-to-many problem. An\nabstract action semantic (i.e., walk forwards) could be conveyed by\nperceptually diverse motions (walk with arms up or swinging), while a motion\ncould carry different semantics w.r.t. its context and intention. This makes an\nelegant mapping between them difficult. Previous attempts adopted\ndirect-mapping paradigms with limited reliability. Also, current automatic\nmetrics fail to provide reliable assessments of the consistency between motions\nand action semantics. We identify the source of these problems as the\nsignificant gap between the two modalities. To alleviate this gap, we propose\nKinematic Phrases (KP) that take the objective kinematic facts of human motion\nwith proper abstraction, interpretability, and generality characteristics.\nBased on KP as a mediator, we can unify a motion knowledge base and build a\nmotion understanding system. Meanwhile, KP can be automatically converted from\nmotions and to text descriptions with no subjective bias, inspiring Kinematic\nPrompt Generation (KPG) as a novel automatic motion generation benchmark. In\nextensive experiments, our approach shows superiority over other methods. Our\ncode and data would be made publicly available at https://foruck.github.io/KP.\n","authors":["Xinpeng Liu","Yong-Lu Li","Ailing Zeng","Zizheng Zhou","Yang You","Cewu Lu"],"pdf_url":"https://arxiv.org/pdf/2310.04189v2.pdf","comment":"Yong-Lu Li and Cewu Lu are the corresponding authors. 
Project page is\n available at https://foruck.github.io/KP/"},{"id":"http://arxiv.org/abs/2310.07265v1","updated":"2023-10-11T07:45:37Z","published":"2023-10-11T07:45:37Z","title":"Distilling Efficient Vision Transformers from CNNs for Semantic\n Segmentation","summary":" In this paper, we tackle a new problem: how to transfer knowledge from the\npre-trained cumbersome yet well-performed CNN-based model to learn a compact\nVision Transformer (ViT)-based model while maintaining its learning capacity?\nDue to the completely different characteristics of ViT and CNN and the\nlong-existing capacity gap between teacher and student models in Knowledge\nDistillation (KD), directly transferring the cross-model knowledge is\nnon-trivial. To this end, we subtly leverage the visual and\nlinguistic-compatible feature character of ViT (i.e., student), and its\ncapacity gap with the CNN (i.e., teacher) and propose a novel CNN-to-ViT KD\nframework, dubbed C2VKD. Importantly, as the teacher's features are\nheterogeneous to those of the student, we first propose a novel\nvisual-linguistic feature distillation (VLFD) module that explores efficient KD\namong the aligned visual and linguistic-compatible representations. Moreover,\ndue to the large capacity gap between the teacher and student and the\ninevitable prediction errors of the teacher, we then propose a pixel-wise\ndecoupled distillation (PDD) module to supervise the student under the\ncombination of labels and teacher's predictions from the decoupled target and\nnon-target classes. Experiments on three semantic segmentation benchmark\ndatasets consistently show that the increment of mIoU of our method is over\n200% of the SoTA KD methods\n","authors":["Xu Zheng","Yunhao Luo","Pengyuan Zhou","Lin Wang"],"pdf_url":"https://arxiv.org/pdf/2310.07265v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.13720v5","updated":"2023-10-11T07:39:48Z","published":"2023-06-23T18:08:00Z","title":"Decoupled Diffusion Models: Image to Zero and Zero to Noise","summary":" Recent diffusion probabilistic models (DPMs) have shown remarkable abilities\nof generated content, however, they often suffer from complex forward\nprocesses, resulting in inefficient solutions for the reversed process and\nprolonged sampling times. In this paper, we aim to address the aforementioned\nchallenges by focusing on the diffusion process itself that we propose to\ndecouple the intricate diffusion process into two comparatively simpler process\nto improve the generative efficacy and speed. In particular, we present a novel\ndiffusion paradigm named DDM (Decoupled Diffusion Models) based on the Ito\ndiffusion process, in which the image distribution is approximated by an\nexplicit transition probability while the noise path is controlled by the\nstandard Wiener process. We find that decoupling the diffusion process reduces\nthe learning difficulty and the explicit transition probability improves the\ngenerative speed significantly. We prove a new training objective for DPM,\nwhich enables the model to learn to predict the noise and image components\nseparately. Moreover, given the novel forward diffusion equation, we derive the\nreverse denoising formula of DDM that naturally supports fewer steps of\ngeneration without ordinary differential equation (ODE) based accelerators. Our\nexperiments demonstrate that DDM outperforms previous DPMs by a large margin in\nfewer function evaluations setting and gets comparable performances in long\nfunction evaluations setting. 
We also show that our framework can be applied to\nimage-conditioned generation and high-resolution image synthesis, and that it\ncan generate high-quality images with only 10 function evaluations.\n","authors":["Yuhang Huang","Liang Zheng","Zheng Qin","Xinwang Liu","Kai Xu"],"pdf_url":"https://arxiv.org/pdf/2306.13720v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07259v1","updated":"2023-10-11T07:37:13Z","published":"2023-10-11T07:37:13Z","title":"Uncovering Hidden Connections: Iterative Tracking and Reasoning for\n Video-grounded Dialog","summary":" In contrast to conventional visual question answering, video-grounded dialog\nnecessitates a profound understanding of both dialog history and video content\nfor accurate response generation. Despite commendable strides made by existing\nmethodologies, they often grapple with the challenges of incrementally\nunderstanding intricate dialog histories and assimilating video information. In\nresponse to this gap, we present an iterative tracking and reasoning strategy\nthat amalgamates a textual encoder, a visual encoder, and a generator. At its\ncore, our textual encoder is fortified with a path tracking and aggregation\nmechanism, adept at gleaning nuances from dialog history that are pivotal to\ndeciphering the posed questions. Concurrently, our visual encoder harnesses an\niterative reasoning network, meticulously crafted to distill and emphasize\ncritical visual markers from videos, enhancing the depth of visual\ncomprehension. Culminating this enriched information, we employ the pre-trained\nGPT-2 model as our response generator, stitching together coherent and\ncontextually apt answers. Our empirical assessments, conducted on two renowned\ndatasets, testify to the prowess and adaptability of our proposed design.\n","authors":["Haoyu Zhang","Meng Liu","Yaowei Wang","Da Cao","Weili Guan","Liqiang Nie"],"pdf_url":"https://arxiv.org/pdf/2310.07259v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07255v1","updated":"2023-10-11T07:30:37Z","published":"2023-10-11T07:30:37Z","title":"ADASR: An Adversarial Auto-Augmentation Framework for Hyperspectral and\n Multispectral Data Fusion","summary":" Deep learning-based hyperspectral image (HSI) super-resolution, which aims to\ngenerate high spatial resolution HSI (HR-HSI) by fusing hyperspectral image\n(HSI) and multispectral image (MSI) with deep neural networks (DNNs), has\nattracted lots of attention. However, neural networks require large amounts of\ntraining data, hindering their application in real-world scenarios. In this\nletter, we propose a novel adversarial automatic data augmentation framework\nADASR that automatically optimizes and augments HSI-MSI sample pairs to enrich\ndata diversity for HSI-MSI fusion. Our framework is sample-aware and optimizes\nan augmentor network and two downsampling networks jointly by adversarial\nlearning so that we can learn more robust downsampling networks for training\nthe upsampling network. Extensive experiments on two public classical\nhyperspectral datasets demonstrate the effectiveness of our ADASR compared to\nthe state-of-the-art methods.\n","authors":["Jinghui Qin","Lihuang Fang","Ruitao Lu","Liang Lin","Yukai Shi"],"pdf_url":"https://arxiv.org/pdf/2310.07255v1.pdf","comment":"This paper has been accepted by IEEE Geoscience and Remote Sensing\n Letters. 
Code is released at https://github.com/fangfang11-plog/ADASR"},{"id":"http://arxiv.org/abs/2310.07252v1","updated":"2023-10-11T07:30:01Z","published":"2023-10-11T07:30:01Z","title":"A Comparative Study of Pre-trained CNNs and GRU-Based Attention for\n Image Caption Generation","summary":" Image captioning is a challenging task involving generating a textual\ndescription for an image using computer vision and natural language processing\ntechniques. This paper proposes a deep neural framework for image caption\ngeneration using a GRU-based attention mechanism. Our approach employs multiple\npre-trained convolutional neural networks as the encoder to extract features\nfrom the image and a GRU-based language model as the decoder to generate\ndescriptive sentences. To improve performance, we integrate the Bahdanau\nattention model with the GRU decoder to enable learning to focus on specific\nimage parts. We evaluate our approach using the MSCOCO and Flickr30k datasets\nand show that it achieves competitive scores compared to state-of-the-art\nmethods. Our proposed framework can bridge the gap between computer vision and\nnatural language and can be extended to specific domains.\n","authors":["Rashid Khan","Bingding Huang","Haseeb Hassan","Asim Zaman","Zhongfu Ye"],"pdf_url":"https://arxiv.org/pdf/2310.07252v1.pdf","comment":"15pages, 10 figures, 5 tables. 2023 the 5th International Conference\n on Robotics and Computer Vision (ICRCV 2023). arXiv admin note: substantial\n text overlap with arXiv:2203.01594"},{"id":"http://arxiv.org/abs/2310.07250v1","updated":"2023-10-11T07:27:28Z","published":"2023-10-11T07:27:28Z","title":"Synthesizing Missing MRI Sequences from Available Modalities using\n Generative Adversarial Networks in BraTS Dataset","summary":" Glioblastoma is a highly aggressive and lethal form of brain cancer. Magnetic\nresonance imaging (MRI) plays a significant role in the diagnosis, treatment\nplanning, and follow-up of glioblastoma patients due to its non-invasive and\nradiation-free nature. The International Brain Tumor Segmentation (BraTS)\nchallenge has contributed to generating numerous AI algorithms to accurately\nand efficiently segment glioblastoma sub-compartments using four structural\n(T1, T1Gd, T2, T2-FLAIR) MRI scans. However, these four MRI sequences may not\nalways be available. To address this issue, Generative Adversarial Networks\n(GANs) can be used to synthesize the missing MRI sequences. In this paper, we\nimplement and utilize an open-source GAN approach that takes any three MRI\nsequences as input to generate the missing fourth structural sequence. Our\nproposed approach is contributed to the community-driven generally nuanced deep\nlearning framework (GaNDLF) and demonstrates promising results in synthesizing\nhigh-quality and realistic MRI sequences, enabling clinicians to improve their\ndiagnostic capabilities and support the application of AI methods to brain\ntumor MRI quantification.\n","authors":["Ibrahim Ethem Hamamci"],"pdf_url":"https://arxiv.org/pdf/2310.07250v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07248v1","updated":"2023-10-11T07:25:50Z","published":"2023-10-11T07:25:50Z","title":"IBoxCLA: Towards Robust Box-supervised Segmentation of Polyp via\n Improved Box-dice and Contrastive Latent-anchors","summary":" Box-supervised polyp segmentation attracts increasing attention for its\ncost-effective potential. 
Existing solutions often rely on learning-free\nmethods or pretrained models to laboriously generate pseudo masks, triggering\nDice constraint subsequently. In this paper, we found that a model guided by\nthe simplest box-filled masks can accurately predict polyp locations/sizes, but\nsuffers from shape collapsing. In response, we propose two innovative learning\nfashions, Improved Box-dice (IBox) and Contrastive Latent-Anchors (CLA), and\ncombine them to train a robust box-supervised model IBoxCLA. The core idea\nbehind IBoxCLA is to decouple the learning of location/size and shape, allowing\nfor focused constraints on each of them. Specifically, IBox transforms the\nsegmentation map into a proxy map using shape decoupling and confusion-region\nswapping sequentially. Within the proxy map, shapes are disentangled, while\nlocations/sizes are encoded as box-like responses. By constraining the proxy\nmap instead of the raw prediction, the box-filled mask can well supervise\nIBoxCLA without misleading its shape learning. Furthermore, CLA contributes to\nshape learning by generating two types of latent anchors, which are learned and\nupdated using momentum and segmented polyps to steadily represent polyp and\nbackground features. The latent anchors facilitate IBoxCLA to capture\ndiscriminative features within and outside boxes in a contrastive manner,\nyielding clearer boundaries. We benchmark IBoxCLA on five public polyp\ndatasets. The experimental results demonstrate the competitive performance of\nIBoxCLA compared to recent fully-supervised polyp segmentation methods, and its\nsuperiority over other box-supervised state-of-the-arts with a relative\nincrease of overall mDice and mIoU by at least 6.5% and 7.5%, respectively.\n","authors":["Zhiwei Wang","Qiang Hu","Hongkuan Shi","Li He","Man He","Wenxuan Dai","Ting Li","Yitong Zhang","Dun Li","Mei Liu","Qiang Li"],"pdf_url":"https://arxiv.org/pdf/2310.07248v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07247v1","updated":"2023-10-11T07:24:27Z","published":"2023-10-11T07:24:27Z","title":"Optimizing the Placement of Roadside LiDARs for Autonomous Driving","summary":" Multi-agent cooperative perception is an increasingly popular topic in the\nfield of autonomous driving, where roadside LiDARs play an essential role.\nHowever, how to optimize the placement of roadside LiDARs is a crucial but\noften overlooked problem. This paper proposes an approach to optimize the\nplacement of roadside LiDARs by selecting optimized positions within the scene\nfor better perception performance. To efficiently obtain the best combination\nof locations, a greedy algorithm based on perceptual gain is proposed, which\nselects the location that can maximize the perceptual gain sequentially. We\ndefine perceptual gain as the increased perceptual capability when a new LiDAR\nis placed. To obtain the perception capability, we propose a perception\npredictor that learns to evaluate LiDAR placement using only a single point\ncloud frame. 
A dataset named Roadside-Opt is created using the CARLA simulator\nto facilitate research on the roadside LiDAR placement problem.\n","authors":["Wentao Jiang","Hao Xiang","Xinyu Cai","Runsheng Xu","Jiaqi Ma","Yikang Li","Gim Hee Lee","Si Liu"],"pdf_url":"https://arxiv.org/pdf/2310.07247v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07245v1","updated":"2023-10-11T07:22:37Z","published":"2023-10-11T07:22:37Z","title":"Crowd Counting in Harsh Weather using Image Denoising with Pix2Pix GANs","summary":" Visual crowd counting estimates the density of the crowd using deep learning\nmodels such as convolution neural networks (CNNs). The performance of the model\nheavily relies on the quality of the training data that constitutes crowd\nimages. In harsh weather such as fog, dust, and low light conditions, the\ninference performance may severely degrade on the noisy and blur images. In\nthis paper, we propose the use of Pix2Pix generative adversarial network (GAN)\nto first denoise the crowd images prior to passing them to the counting model.\nA Pix2Pix network is trained using synthetic noisy images generated from\noriginal crowd images and then the pretrained generator is then used in the\ninference engine to estimate the crowd density in unseen, noisy crowd images.\nThe performance is tested on JHU-Crowd dataset to validate the significance of\nthe proposed method particularly when high reliability and accuracy are\nrequired.\n","authors":["Muhammad Asif Khan","Hamid Menouar","Ridha Hamila"],"pdf_url":"https://arxiv.org/pdf/2310.07245v1.pdf","comment":"The paper has been accepted for presentation in IEEE 38th\n International Conference on Image and Vision Computing New Zealand (IVCNZ\n 2023). The final manuscript can be accessed at ieeexplore"},{"id":"http://arxiv.org/abs/2310.04991v3","updated":"2023-10-11T07:20:32Z","published":"2023-10-08T03:35:27Z","title":"Video-Teller: Enhancing Cross-Modal Generation with Fusion and\n Decoupling","summary":" This paper proposes Video-Teller, a video-language foundation model that\nleverages multi-modal fusion and fine-grained modality alignment to\nsignificantly enhance the video-to-text generation task. Video-Teller boosts\nthe training efficiency by utilizing frozen pretrained vision and language\nmodules. It capitalizes on the robust linguistic capabilities of large language\nmodels, enabling the generation of both concise and elaborate video\ndescriptions. To effectively integrate visual and auditory information,\nVideo-Teller builds upon the image-based BLIP-2 model and introduces a cascaded\nQ-Former which fuses information across frames and ASR texts. To better guide\nvideo summarization, we introduce a fine-grained modality alignment objective,\nwhere the cascaded Q-Former's output embedding is trained to align with the\ncaption/summary embedding created by a pretrained text auto-encoder.\nExperimental results demonstrate the efficacy of our proposed video-language\nfoundation model in accurately comprehending videos and generating coherent and\nprecise language descriptions. 
It is worth noting that the fine-grained\nalignment enhances the model's capabilities (4% improvement of CIDEr score on\nMSR-VTT) with only 13% extra parameters in training and zero additional cost in\ninference.\n","authors":["Haogeng Liu","Qihang Fan","Tingkai Liu","Linjie Yang","Yunzhe Tao","Huaibo Huang","Ran He","Hongxia Yang"],"pdf_url":"https://arxiv.org/pdf/2310.04991v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.05447v2","updated":"2023-10-11T07:10:49Z","published":"2023-10-09T06:43:48Z","title":"Towards Fair and Comprehensive Comparisons for Image-Based 3D Object\n Detection","summary":" In this work, we build a modular-designed codebase, formulate strong training\nrecipes, design an error diagnosis toolbox, and discuss current methods for\nimage-based 3D object detection. In particular, different from other highly\nmature tasks, e.g., 2D object detection, the community of image-based 3D object\ndetection is still evolving, where methods often adopt different training\nrecipes and tricks resulting in unfair evaluations and comparisons. What is\nworse, these tricks may overwhelm their proposed designs in performance, even\nleading to wrong conclusions. To address this issue, we build a module-designed\ncodebase and formulate unified training standards for the community.\nFurthermore, we also design an error diagnosis toolbox to measure the detailed\ncharacterization of detection models. Using these tools, we analyze current\nmethods in-depth under varying settings and provide discussions for some open\nquestions, e.g., discrepancies in conclusions on KITTI-3D and nuScenes\ndatasets, which have led to different dominant methods for these datasets. We\nhope that this work will facilitate future research in image-based 3D object\ndetection. Our codes will be released at\n\\url{https://github.com/OpenGVLab/3dodi}\n","authors":["Xinzhu Ma","Yongtao Wang","Yinmin Zhang","Zhiyi Xia","Yuan Meng","Zhihui Wang","Haojie Li","Wanli Ouyang"],"pdf_url":"https://arxiv.org/pdf/2310.05447v2.pdf","comment":"ICCV23, code will be released soon"},{"id":"http://arxiv.org/abs/2301.07807v3","updated":"2023-10-11T07:07:18Z","published":"2023-01-18T22:38:03Z","title":"Measuring uncertainty in human visual segmentation","summary":" Segmenting visual stimuli into distinct groups of features and visual objects\nis central to visual function. Classical psychophysical methods have helped\nuncover many rules of human perceptual segmentation, and recent progress in\nmachine learning has produced successful algorithms. Yet, the computational\nlogic of human segmentation remains unclear, partially because we lack\nwell-controlled paradigms to measure perceptual segmentation maps and compare\nmodels quantitatively. Here we propose a new, integrated approach: given an\nimage, we measure multiple pixel-based same--different judgments and perform\nmodel--based reconstruction of the underlying segmentation map. The\nreconstruction is robust to several experimental manipulations and captures the\nvariability of individual participants. We demonstrate the validity of the\napproach on human segmentation of natural images and composite textures. We\nshow that image uncertainty affects measured human variability, and it\ninfluences how participants weigh different visual features. 
Because any\nputative segmentation algorithm can be inserted to perform the reconstruction,\nour paradigm affords quantitative tests of theories of perception as well as\nnew benchmarks for segmentation algorithms.\n","authors":["Jonathan Vacher","Claire Launay","Pascal Mamassian","Ruben Coen-Cagli"],"pdf_url":"https://arxiv.org/pdf/2301.07807v3.pdf","comment":"32 pages, 9 figures, 5 appendix, 5 figures in appendix"},{"id":"http://arxiv.org/abs/2305.17382v3","updated":"2023-10-11T07:02:45Z","published":"2023-05-27T06:24:43Z","title":"APRIL-GAN: A Zero-/Few-Shot Anomaly Classification and Segmentation\n Method for CVPR 2023 VAND Workshop Challenge Tracks 1&2: 1st Place on\n Zero-shot AD and 4th Place on Few-shot AD","summary":" In this technical report, we briefly introduce our solution for the\nZero/Few-shot Track of the Visual Anomaly and Novelty Detection (VAND) 2023\nChallenge. For industrial visual inspection, building a single model that can\nbe rapidly adapted to numerous categories without or with only a few normal\nreference images is a promising research direction. This is primarily because\nof the vast variety of the product types. For the zero-shot track, we propose a\nsolution based on the CLIP model by adding extra linear layers. These layers\nare used to map the image features to the joint embedding space, so that they\ncan compare with the text features to generate the anomaly maps. Besides, when\nthe reference images are available, we utilize multiple memory banks to store\ntheir features and compare them with the features of the test images during the\ntesting phase. In this challenge, our method achieved first place in the\nzero-shot track, especially excelling in segmentation with an impressive F1\nscore improvement of 0.0489 over the second-ranked participant. Furthermore, in\nthe few-shot track, we secured the fourth position overall, with our\nclassification F1 score of 0.8687 ranking first among all participating teams.\n","authors":["Xuhai Chen","Yue Han","Jiangning Zhang"],"pdf_url":"https://arxiv.org/pdf/2305.17382v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.16254v3","updated":"2023-10-11T06:59:47Z","published":"2023-03-28T18:59:17Z","title":"CryoFormer: Continuous Heterogeneous Cryo-EM Reconstruction using\n Transformer-based Neural Representations","summary":" Cryo-electron microscopy (cryo-EM) allows for the high-resolution\nreconstruction of 3D structures of proteins and other biomolecules. Successful\nreconstruction of both shape and movement greatly helps understand the\nfundamental processes of life. However, it is still challenging to reconstruct\nthe continuous motions of 3D structures from hundreds of thousands of noisy and\nrandomly oriented 2D cryo-EM images. Recent advancements use Fourier domain\ncoordinate-based neural networks to continuously model 3D conformations, yet\nthey often struggle to capture local flexible regions accurately. We propose\nCryoFormer, a new approach for continuous heterogeneous cryo-EM reconstruction.\nOur approach leverages an implicit feature volume directly in the real domain\nas the 3D representation. We further introduce a novel query-based deformation\ntransformer decoder to improve the reconstruction quality. Our approach is\ncapable of refining pre-computed pose estimations and locating flexible\nregions. In experiments, our method outperforms current approaches on three\npublic datasets (1 synthetic and 2 experimental) and a new synthetic dataset of\nPEDV spike protein. 
The code and new synthetic dataset will be released for\nbetter reproducibility of our results. Project page:\nhttps://cryoformer.github.io.\n","authors":["Xinhang Liu","Yan Zeng","Yifan Qin","Hao Li","Jiakai Zhang","Lan Xu","Jingyi Yu"],"pdf_url":"https://arxiv.org/pdf/2303.16254v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07237v1","updated":"2023-10-11T06:58:22Z","published":"2023-10-11T06:58:22Z","title":"SAGE-ICP: Semantic Information-Assisted ICP","summary":" Robust and accurate pose estimation in unknown environments is an essential\npart of robotic applications. We focus on LiDAR-based point-to-point ICP\ncombined with effective semantic information. This paper proposes a novel\nsemantic information-assisted ICP method named SAGE-ICP, which leverages\nsemantics in odometry. The semantic information for the whole scan is timely\nand efficiently extracted by a 3D convolution network, and these point-wise\nlabels are deeply involved in every part of the registration, including\nsemantic voxel downsampling, data association, adaptive local map, and dynamic\nvehicle removal. Unlike previous semantic-aided approaches, the proposed method\ncan improve localization accuracy in large-scale scenes even if the semantic\ninformation has certain errors. Experimental evaluations on KITTI and KITTI-360\nshow that our method outperforms the baseline methods, and improves accuracy\nwhile maintaining real-time performance, i.e., runs faster than the sensor\nframe rate.\n","authors":["Jiaming Cui","Jiming Chen","Liang Li"],"pdf_url":"https://arxiv.org/pdf/2310.07237v1.pdf","comment":"6+1 pages, 4 figures"},{"id":"http://arxiv.org/abs/2310.07236v1","updated":"2023-10-11T06:56:08Z","published":"2023-10-11T06:56:08Z","title":"AdaMesh: Personalized Facial Expressions and Head Poses for\n Speech-Driven 3D Facial Animation","summary":" Speech-driven 3D facial animation aims at generating facial movements that\nare synchronized with the driving speech, which has been widely explored\nrecently. Existing works mostly neglect the person-specific talking style in\ngeneration, including facial expression and head pose styles. Several works\nintend to capture the personalities by fine-tuning modules. However, limited\ntraining data leads to the lack of vividness. In this work, we propose AdaMesh,\na novel adaptive speech-driven facial animation approach, which learns the\npersonalized talking style from a reference video of about 10 seconds and\ngenerates vivid facial expressions and head poses. Specifically, we propose\nmixture-of-low-rank adaptation (MoLoRA) to fine-tune the expression adapter,\nwhich efficiently captures the facial expression style. For the personalized\npose style, we propose a pose adapter by building a discrete pose prior and\nretrieving the appropriate style embedding with a semantic-aware pose style\nmatrix without fine-tuning. Extensive experimental results show that our\napproach outperforms state-of-the-art methods, preserves the talking style in\nthe reference video, and generates vivid facial animation. 
The supplementary\nvideo and code will be available at https://adamesh.github.io.\n","authors":["Liyang Chen","Weihong Bao","Shun Lei","Boshi Tang","Zhiyong Wu","Shiyin Kang","Haozhi Huang"],"pdf_url":"https://arxiv.org/pdf/2310.07236v1.pdf","comment":"Project Page: https://adamesh.github.io"},{"id":"http://arxiv.org/abs/2309.13438v3","updated":"2023-10-11T06:43:08Z","published":"2023-09-23T17:29:38Z","title":"Rethinking Superpixel Segmentation from Biologically Inspired Mechanisms","summary":" Recently, advancements in deep learning-based superpixel segmentation methods\nhave brought about improvements in both the efficiency and the performance of\nsegmentation. However, a significant challenge remains in generating\nsuperpixels that strictly adhere to object boundaries while conveying rich\nvisual significance, especially when cross-surface color correlations may\ninterfere with objects. Drawing inspiration from neural structure and visual\nmechanisms, we propose a biological network architecture comprising an Enhanced\nScreening Module (ESM) and a novel Boundary-Aware Label (BAL) for superpixel\nsegmentation. The ESM enhances semantic information by simulating the\ninteractive projection mechanisms of the visual cortex. Additionally, the BAL\nemulates the spatial frequency characteristics of visual cortical cells to\nfacilitate the generation of superpixels with strong boundary adherence. We\ndemonstrate the effectiveness of our approach through evaluations on both the\nBSDS500 dataset and the NYUv2 dataset.\n","authors":["Tingyu Zhao","Bo Peng","Yuan Sun","Daipeng Yang","Zhenguang Zhang","Xi Wu"],"pdf_url":"https://arxiv.org/pdf/2309.13438v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.13311v2","updated":"2023-10-11T06:28:41Z","published":"2023-05-22T17:59:45Z","title":"VDT: General-purpose Video Diffusion Transformers via Mask Modeling","summary":" This work introduces Video Diffusion Transformer (VDT), which pioneers the\nuse of transformers in diffusion-based video generation. It features\ntransformer blocks with modularized temporal and spatial attention modules to\nleverage the rich spatial-temporal representation inherited in transformers. We\nalso propose a unified spatial-temporal mask modeling mechanism, seamlessly\nintegrated with the model, to cater to diverse video generation scenarios. VDT\noffers several appealing benefits. 1) It excels at capturing temporal\ndependencies to produce temporally consistent video frames and even simulate\nthe physics and dynamics of 3D objects over time. 2) It facilitates flexible\nconditioning information, \\eg, simple concatenation in the token space,\neffectively unifying different token lengths and modalities. 3) Pairing with\nour proposed spatial-temporal mask modeling mechanism, it becomes a\ngeneral-purpose video diffuser for harnessing a range of tasks, including\nunconditional generation, video prediction, interpolation, animation, and\ncompletion, etc. Extensive experiments on these tasks spanning various\nscenarios, including autonomous driving, natural weather, human action, and\nphysics-based simulation, demonstrate the effectiveness of VDT. Additionally,\nwe present comprehensive studies on how \\model handles conditioning information\nwith the mask modeling mechanism, which we believe will benefit future research\nand advance the field. 
Project page: https:VDT-2023.github.io\n","authors":["Haoyu Lu","Guoxing Yang","Nanyi Fei","Yuqi Huo","Zhiwu Lu","Ping Luo","Mingyu Ding"],"pdf_url":"https://arxiv.org/pdf/2305.13311v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07223v1","updated":"2023-10-11T06:13:50Z","published":"2023-10-11T06:13:50Z","title":"Deep Learning for blind spectral unmixing of LULC classes with MODIS\n multispectral time series and ancillary data","summary":" Remotely sensed data are dominated by mixed Land Use and Land Cover (LULC)\ntypes. Spectral unmixing is a technique to extract information from mixed\npixels into their constituent LULC types and corresponding abundance fractions.\nTraditionally, solving this task has relied on either classical methods that\nrequire prior knowledge of endmembers or machine learning methods that avoid\nexplicit endmembers calculation, also known as blind spectral unmixing (BSU).\nMost BSU studies based on Deep Learning (DL) focus on one time-step\nhyperspectral data, yet its acquisition remains quite costly compared with\nmultispectral data. To our knowledge, here we provide the first study on BSU of\nLULC classes using multispectral time series data with DL models. We further\nboost the performance of a Long-Short Term Memory (LSTM)-based model by\nincorporating geographic plus topographic (geo-topographic) and climatic\nancillary information. Our experiments show that combining spectral-temporal\ninput data together with geo-topographic and climatic information substantially\nimproves the abundance estimation of LULC classes in mixed pixels. To carry out\nthis study, we built a new labeled dataset of the region of Andalusia (Spain)\nwith monthly multispectral time series of pixels for the year 2013 from MODIS\nat 460m resolution, for two hierarchical levels of LULC classes, named\nAndalusia MultiSpectral MultiTemporal Unmixing (Andalusia-MSMTU). This dataset\nprovides, at the pixel level, a multispectral time series plus ancillary\ninformation annotated with the abundance of each LULC class inside each pixel.\nThe dataset and code are available to the public.\n","authors":["José Rodríguez-Ortega","Rohaifa Khaldi","Domingo Alcaraz-Segura","Siham Tabik"],"pdf_url":"https://arxiv.org/pdf/2310.07223v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07222v1","updated":"2023-10-11T06:11:42Z","published":"2023-10-11T06:11:42Z","title":"Uni-paint: A Unified Framework for Multimodal Image Inpainting with\n Pretrained Diffusion Model","summary":" Recently, text-to-image denoising diffusion probabilistic models (DDPMs) have\ndemonstrated impressive image generation capabilities and have also been\nsuccessfully applied to image inpainting. However, in practice, users often\nrequire more control over the inpainting process beyond textual guidance,\nespecially when they want to composite objects with customized appearance,\ncolor, shape, and layout. Unfortunately, existing diffusion-based inpainting\nmethods are limited to single-modal guidance and require task-specific\ntraining, hindering their cross-modal scalability. To address these\nlimitations, we propose Uni-paint, a unified framework for multimodal\ninpainting that offers various modes of guidance, including unconditional,\ntext-driven, stroke-driven, exemplar-driven inpainting, as well as a\ncombination of these modes. Furthermore, our Uni-paint is based on pretrained\nStable Diffusion and does not require task-specific training on specific\ndatasets, enabling few-shot generalizability to customized images. 
We have\nconducted extensive qualitative and quantitative evaluations that show our\napproach achieves comparable results to existing single-modal methods while\noffering multimodal inpainting capabilities not available in other methods.\nCode will be available at https://github.com/ysy31415/unipaint.\n","authors":["Shiyuan Yang","Xiaodong Chen","Jing Liao"],"pdf_url":"https://arxiv.org/pdf/2310.07222v1.pdf","comment":"Accepted by ACMMM'23"},{"id":"http://arxiv.org/abs/2310.04895v2","updated":"2023-10-11T05:59:53Z","published":"2023-10-07T18:47:17Z","title":"Cell Tracking-by-detection using Elliptical Bounding Boxes","summary":" Cell detection and tracking are paramount for bio-analysis. Recent approaches\nrely on the tracking-by-model evolution paradigm, which usually consists of\ntraining end-to-end deep learning models to detect and track the cells on the\nframes with promising results. However, such methods require extensive amounts\nof annotated data, which is time-consuming to obtain and often requires\nspecialized annotators. This work proposes a new approach based on the\nclassical tracking-by-detection paradigm that alleviates the requirement of\nannotated data. More precisely, it approximates the cell shapes as oriented\nellipses and then uses generic-purpose oriented object detectors to identify\nthe cells in each frame. We then rely on a global data association algorithm\nthat explores temporal cell similarity using probability distance metrics,\nconsidering that the ellipses relate to two-dimensional Gaussian distributions.\nOur results show that our method can achieve detection and tracking results\ncompetitively with state-of-the-art techniques that require considerably more\nextensive data annotation. Our code is available at:\nhttps://github.com/LucasKirsten/Deep-Cell-Tracking-EBB.\n","authors":["Lucas N. Kirsten","Cláudio R. Jung"],"pdf_url":"https://arxiv.org/pdf/2310.04895v2.pdf","comment":"Paper under review on IEEE/ACM Transactions on Computational Biology\n and Bioinformatics"},{"id":"http://arxiv.org/abs/2310.07212v1","updated":"2023-10-11T05:58:14Z","published":"2023-10-11T05:58:14Z","title":"Multi-Task Learning-Enabled Automatic Vessel Draft Reading for\n Intelligent Maritime Surveillance","summary":" The accurate and efficient vessel draft reading (VDR) is an important\ncomponent of intelligent maritime surveillance, which could be exploited to\nassist in judging whether the vessel is normally loaded or overloaded. The\ncomputer vision technique with an excellent price-to-performance ratio has\nbecome a popular medium to estimate vessel draft depth. However, the\ntraditional estimation methods easily suffer from several limitations, such as\nsensitivity to low-quality images, high computational cost, etc. In this work,\nwe propose a multi-task learning-enabled computational method (termed MTL-VDR)\nfor generating highly reliable VDR. In particular, our MTL-VDR mainly consists\nof four components, i.e., draft mark detection, draft scale recognition,\nvessel/water segmentation, and final draft depth estimation. We first construct\na benchmark dataset related to draft mark detection and employ a powerful and\nefficient convolutional neural network to accurately perform the detection\ntask. The multi-task learning method is then proposed for simultaneous draft\nscale recognition and vessel/water segmentation. 
To obtain more robust VDR\nunder complex conditions (e.g., damaged and stained scales, etc.), the accurate\ndraft scales are generated by an automatic correction method, which is\npresented based on the spatial distribution rules of draft scales. Finally, an\nadaptive computational method is exploited to yield an accurate and robust\ndraft depth. Extensive experiments have been implemented on the realistic\ndataset to compare our MTL-VDR with state-of-the-art methods. The results have\ndemonstrated its superior performance in terms of accuracy, robustness, and\nefficiency. The computational speed exceeds 40 FPS, which satisfies the\nrequirements of real-time maritime surveillance to guarantee vessel traffic\nsafety.\n","authors":["Jingxiang Qu","Ryan Wen Liu","Chenjie Zhao","Yu Guo","Sendren Sheng-Dong Xu","Fenghua Zhu","Yisheng Lv"],"pdf_url":"https://arxiv.org/pdf/2310.07212v1.pdf","comment":"12 pages,11 figures, submitted to IEEE T-ITS"},{"id":"http://arxiv.org/abs/2310.07209v1","updated":"2023-10-11T05:49:47Z","published":"2023-10-11T05:49:47Z","title":"Multi-task Explainable Skin Lesion Classification","summary":" Skin cancer is one of the deadliest diseases and has a high mortality rate if\nleft untreated. The diagnosis generally starts with visual screening and is\nfollowed by a biopsy or histopathological examination. Early detection can aid\nin lowering mortality rates. Visual screening can be limited by the experience\nof the doctor. Due to the long tail distribution of dermatological datasets and\nsignificant intra-variability between classes, automatic classification\nutilizing computer-aided methods becomes challenging. In this work, we propose\na multitask few-shot-based approach for skin lesions that generalizes well with\nfew labelled data to address the small sample space challenge. The proposed\napproach comprises a fusion of a segmentation network that acts as an attention\nmodule and classification network. The output of the segmentation network helps\nto focus on the most discriminatory features while making a decision by the\nclassification network. To further enhance the classification performance, we\nhave combined segmentation and classification loss in a weighted manner. We\nhave also included the visualization results that explain the decisions made by\nthe algorithm. Three dermatological datasets are used to evaluate the proposed\nmethod thoroughly. We also conducted cross-database experiments to ensure that\nthe proposed approach is generalizable across similar datasets. Experimental\nresults demonstrate the efficacy of the proposed work.\n","authors":["Mahapara Khurshid","Mayank Vatsa","Richa Singh"],"pdf_url":"https://arxiv.org/pdf/2310.07209v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.05341v3","updated":"2023-10-11T05:46:28Z","published":"2023-10-09T01:59:49Z","title":"A Critical Look at Classic Test-Time Adaptation Methods in Semantic\n Segmentation","summary":" Test-time adaptation (TTA) aims to adapt a model, initially trained on\ntraining data, to potential distribution shifts in the test data. Most existing\nTTA studies, however, focus on classification tasks, leaving a notable gap in\nthe exploration of TTA for semantic segmentation. This pronounced emphasis on\nclassification might lead numerous newcomers and engineers to mistakenly assume\nthat classic TTA methods designed for classification can be directly applied to\nsegmentation. Nonetheless, this assumption remains unverified, posing an open\nquestion. 
To address this, we conduct a systematic, empirical study to disclose\nthe unique challenges of segmentation TTA, and to determine whether classic TTA\nstrategies can effectively address this task. Our comprehensive results have\nled to three key observations. First, the classic batch norm updating strategy,\ncommonly used in classification TTA, only brings slight performance\nimprovement, and in some cases it might even adversely affect the results. Even\nwith the application of advanced distribution estimation techniques like batch\nrenormalization, the problem remains unresolved. Second, the teacher-student\nscheme does enhance training stability for segmentation TTA in the presence of\nnoisy pseudo-labels. However, it cannot directly result in performance\nimprovement compared to the original model without TTA. Third, segmentation TTA\nsuffers a severe long-tailed imbalance problem, which is substantially more\ncomplex than that in TTA for classification. This long-tailed challenge\nsignificantly affects segmentation TTA performance, even when the accuracy of\npseudo-labels is high. In light of these observations, we conclude that TTA for\nsegmentation presents significant challenges, and simply using classic TTA\nmethods cannot address this problem well.\n","authors":["Chang'an Yi","Haotian Chen","Yifan Zhang","Yonghui Xu","Lizhen Cui"],"pdf_url":"https://arxiv.org/pdf/2310.05341v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.05917v2","updated":"2023-10-11T05:41:16Z","published":"2023-10-09T17:59:12Z","title":"Drivable Avatar Clothing: Faithful Full-Body Telepresence with Dynamic\n Clothing Driven by Sparse RGB-D Input","summary":" Clothing is an important part of human appearance but challenging to model in\nphotorealistic avatars. In this work we present avatars with dynamically moving\nloose clothing that can be faithfully driven by sparse RGB-D inputs as well as\nbody and face motion. We propose a Neural Iterative Closest Point (N-ICP)\nalgorithm that can efficiently track the coarse garment shape given sparse\ndepth input. Given the coarse tracking results, the input RGB-D images are then\nremapped to texel-aligned features, which are fed into the drivable avatar\nmodels to faithfully reconstruct appearance details. We evaluate our method\nagainst recent image-driven synthesis baselines, and conduct a comprehensive\nanalysis of the N-ICP algorithm. We demonstrate that our method can generalize\nto a novel testing environment, while preserving the ability to produce\nhigh-fidelity and faithful clothing dynamics and appearance.\n","authors":["Donglai Xiang","Fabian Prada","Zhe Cao","Kaiwen Guo","Chenglei Wu","Jessica Hodgins","Timur Bagautdinov"],"pdf_url":"https://arxiv.org/pdf/2310.05917v2.pdf","comment":"SIGGRAPH Asia 2023 Conference Paper. Project website:\n https://xiangdonglai.github.io/www-sa23-drivable-clothing/"},{"id":"http://arxiv.org/abs/2310.07206v1","updated":"2023-10-11T05:34:36Z","published":"2023-10-11T05:34:36Z","title":"DeepSimHO: Stable Pose Estimation for Hand-Object Interaction via\n Physics Simulation","summary":" This paper addresses the task of 3D pose estimation for a hand interacting\nwith an object from a single image observation. When modeling hand-object\ninteraction, previous works mainly exploit proximity cues, while overlooking\nthe dynamical nature that the hand must stably grasp the object to counteract\ngravity and thus preventing the object from slipping or falling. 
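The test-time-adaptation study above repeatedly refers to the "classic batch norm updating strategy" from classification TTA. A minimal sketch of that strategy as it is commonly implemented: keep all weights frozen but let BatchNorm layers normalize with the statistics of the current unlabeled test batch. The helper name is ours and this is not the paper's code.

```python
import torch
import torch.nn as nn

def tta_use_batch_stats(model: nn.Module):
    """Classic test-time BN adaptation: every BatchNorm layer normalizes with
    the current test batch's statistics instead of the stored source-domain
    running statistics; no parameters are updated."""
    model.eval()
    for m in model.modules():
        if isinstance(m, (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d)):
            m.train()                      # use batch statistics in the forward pass
            m.track_running_stats = False  # do not overwrite source statistics
    return model

# Usage on a toy network and an unlabeled test batch.
net = nn.Sequential(nn.Conv2d(3, 8, 3, padding=1), nn.BatchNorm2d(8),
                    nn.ReLU(), nn.Conv2d(8, 5, 1))
net = tta_use_batch_stats(net)
with torch.no_grad():
    logits = net(torch.randn(4, 3, 32, 32))   # adapted prediction, no labels needed
```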
These works\nfail to leverage dynamical constraints in the estimation and consequently often\nproduce unstable results. Meanwhile, refining unstable configurations with\nphysics-based reasoning remains challenging, both by the complexity of contact\ndynamics and by the lack of effective and efficient physics inference in the\ndata-driven learning framework. To address both issues, we present DeepSimHO: a\nnovel deep-learning pipeline that combines forward physics simulation and\nbackward gradient approximation with a neural network. Specifically, for an\ninitial hand-object pose estimated by a base network, we forward it to a\nphysics simulator to evaluate its stability. However, due to non-smooth contact\ngeometry and penetration, existing differentiable simulators can not provide\nreliable state gradient. To remedy this, we further introduce a deep network to\nlearn the stability evaluation process from the simulator, while smoothly\napproximating its gradient and thus enabling effective back-propagation.\nExtensive experiments show that our method noticeably improves the stability of\nthe estimation and achieves superior efficiency over test-time optimization.\nThe code is available at https://github.com/rongakowang/DeepSimHO.\n","authors":["Rong Wang","Wei Mao","Hongdong Li"],"pdf_url":"https://arxiv.org/pdf/2310.07206v1.pdf","comment":"Accepted to NeurIPS 2023"},{"id":"http://arxiv.org/abs/2309.17421v2","updated":"2023-10-11T05:07:37Z","published":"2023-09-29T17:34:51Z","title":"The Dawn of LMMs: Preliminary Explorations with GPT-4V(ision)","summary":" Large multimodal models (LMMs) extend large language models (LLMs) with\nmulti-sensory skills, such as visual understanding, to achieve stronger generic\nintelligence. In this paper, we analyze the latest model, GPT-4V(ision), to\ndeepen the understanding of LMMs. The analysis focuses on the intriguing tasks\nthat GPT-4V can perform, containing test samples to probe the quality and\ngenericity of GPT-4V's capabilities, its supported inputs and working modes,\nand the effective ways to prompt the model. In our approach to exploring\nGPT-4V, we curate and organize a collection of carefully designed qualitative\nsamples spanning a variety of domains and tasks. Observations from these\nsamples demonstrate that GPT-4V's unprecedented ability in processing\narbitrarily interleaved multimodal inputs and the genericity of its\ncapabilities together make GPT-4V a powerful multimodal generalist system.\nFurthermore, GPT-4V's unique capability of understanding visual markers drawn\non input images can give rise to new human-computer interaction methods such as\nvisual referring prompting. We conclude the report with in-depth discussions on\nthe emerging application scenarios and the future research directions for\nGPT-4V-based systems. We hope that this preliminary exploration will inspire\nfuture research on the next-generation multimodal task formulation, new ways to\nexploit and enhance LMMs to solve real-world problems, and gaining better\nunderstanding of multimodal foundation models. Finally, we acknowledge that the\nmodel under our study is solely the product of OpenAI's innovative work, and\nthey should be fully credited for its development. 
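The DeepSimHO entry above works around unreliable simulator gradients by training a network to imitate the simulator's stability evaluation and back-propagating through that smooth approximation. The sketch below shows only the generic pattern, with a toy stand-in "simulator" and a hypothetical 6-D pose vector; it is not the paper's pipeline.

```python
import torch
import torch.nn as nn

def blackbox_stability(pose: torch.Tensor) -> torch.Tensor:
    """Stand-in for a non-differentiable physics simulator: returns a scalar
    'stability' score per pose. Gradients do NOT flow through this function."""
    with torch.no_grad():
        return torch.exp(-((pose - 0.5) ** 2).sum(dim=-1))

# A small MLP learns to imitate the simulator, giving a smooth differentiable proxy.
surrogate = nn.Sequential(nn.Linear(6, 64), nn.ReLU(), nn.Linear(64, 1))
opt = torch.optim.Adam(surrogate.parameters(), lr=1e-3)

for _ in range(500):                      # fit the surrogate on (pose, score) pairs
    poses = torch.rand(128, 6)
    target = blackbox_stability(poses).unsqueeze(-1)
    loss = ((surrogate(poses) - target) ** 2).mean()
    opt.zero_grad(); loss.backward(); opt.step()

# The surrogate's gradient can now steer a pose estimate toward higher stability.
pose = torch.rand(1, 6, requires_grad=True)
surrogate(pose).sum().backward()
print(pose.grad)                          # usable gradient despite the black-box simulator
```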
Please see the GPT-4V\ncontributions paper for the authorship and credit attribution:\nhttps://cdn.openai.com/contributions/gpt-4v.pdf\n","authors":["Zhengyuan Yang","Linjie Li","Kevin Lin","Jianfeng Wang","Chung-Ching Lin","Zicheng Liu","Lijuan Wang"],"pdf_url":"https://arxiv.org/pdf/2309.17421v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07189v1","updated":"2023-10-11T04:38:21Z","published":"2023-10-11T04:38:21Z","title":"SpikePoint: An Efficient Point-based Spiking Neural Network for Event\n Cameras Action Recognition","summary":" Event cameras are bio-inspired sensors that respond to local changes in light\nintensity and feature low latency, high energy efficiency, and high dynamic\nrange. Meanwhile, Spiking Neural Networks (SNNs) have gained significant\nattention due to their remarkable efficiency and fault tolerance. By\nsynergistically harnessing the energy efficiency inherent in event cameras and\nthe spike-based processing capabilities of SNNs, their integration could enable\nultra-low-power application scenarios, such as action recognition tasks.\nHowever, existing approaches often entail converting asynchronous events into\nconventional frames, leading to additional data mapping efforts and a loss of\nsparsity, contradicting the design concept of SNNs and event cameras. To\naddress this challenge, we propose SpikePoint, a novel end-to-end point-based\nSNN architecture. SpikePoint excels at processing sparse event cloud data,\neffectively extracting both global and local features through a singular-stage\nstructure. Leveraging the surrogate training method, SpikePoint achieves high\naccuracy with few parameters and maintains low power consumption, specifically\nemploying the identity mapping feature extractor on diverse datasets.\nSpikePoint achieves state-of-the-art (SOTA) performance on four event-based\naction recognition datasets using only 16 timesteps, surpassing other SNN\nmethods. Moreover, it also achieves SOTA performance across all methods on\nthree datasets, utilizing approximately 0.3\\% of the parameters and 0.5\\% of\npower consumption employed by artificial neural networks (ANNs). These results\nemphasize the significance of Point Cloud and pave the way for many\nultra-low-power event-based data processing applications.\n","authors":["Hongwei Ren","Yue Zhou","Yulong Huang","Haotian Fu","Xiaopeng Lin","Jie Song","Bojun Cheng"],"pdf_url":"https://arxiv.org/pdf/2310.07189v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2310.07186v1","updated":"2023-10-11T04:25:24Z","published":"2023-10-11T04:25:24Z","title":"Multiview Transformer: Rethinking Spatial Information in Hyperspectral\n Image Classification","summary":" Identifying the land cover category for each pixel in a hyperspectral image\n(HSI) relies on spectral and spatial information. An HSI cuboid with a specific\npatch size is utilized to extract spatial-spectral feature representation for\nthe central pixel. In this article, we investigate that scene-specific but not\nessential correlations may be recorded in an HSI cuboid. This additional\ninformation improves the model performance on existing HSI datasets and makes\nit hard to properly evaluate the ability of a model. We refer to this problem\nas the spatial overfitting issue and utilize strict experimental settings to\navoid it. 
We further propose a multiview transformer for HSI classification,\nwhich consists of multiview principal component analysis (MPCA), spectral\nencoder-decoder (SED), and spatial-pooling tokenization transformer (SPTT).\nMPCA performs dimension reduction on an HSI via constructing spectral multiview\nobservations and applying PCA on each view data to extract low-dimensional view\nrepresentation. The combination of view representations, named multiview\nrepresentation, is the dimension reduction output of the MPCA. To aggregate the\nmultiview information, a fully-convolutional SED with a U-shape in spectral\ndimension is introduced to extract a multiview feature map. SPTT transforms the\nmultiview features into tokens using the spatial-pooling tokenization strategy\nand learns robust and discriminative spatial-spectral features for land cover\nidentification. Classification is conducted with a linear classifier.\nExperiments on three HSI datasets with rigid settings demonstrate the\nsuperiority of the proposed multiview transformer over the state-of-the-art\nmethods.\n","authors":["Jie Zhang","Yongshan Zhang","Yicong Zhou"],"pdf_url":"https://arxiv.org/pdf/2310.07186v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07184v1","updated":"2023-10-11T04:20:32Z","published":"2023-10-11T04:20:32Z","title":"NeuroInspect: Interpretable Neuron-based Debugging Framework through\n Class-conditional Visualizations","summary":" Despite deep learning (DL) has achieved remarkable progress in various\ndomains, the DL models are still prone to making mistakes. This issue\nnecessitates effective debugging tools for DL practitioners to interpret the\ndecision-making process within the networks. However, existing debugging\nmethods often demand extra data or adjustments to the decision process,\nlimiting their applicability. To tackle this problem, we present NeuroInspect,\nan interpretable neuron-based debugging framework with three key stages:\ncounterfactual explanations, feature visualizations, and false correlation\nmitigation. Our debugging framework first pinpoints neurons responsible for\nmistakes in the network and then visualizes features embedded in the neurons to\nbe human-interpretable. To provide these explanations, we introduce\nCLIP-Illusion, a novel feature visualization method that generates images\nrepresenting features conditioned on classes to examine the connection between\nneurons and the decision layer. We alleviate convoluted explanations of the\nconventional visualization approach by employing class information, thereby\nisolating mixed properties. This process offers more human-interpretable\nexplanations for model errors without altering the trained network or requiring\nadditional data. Furthermore, our framework mitigates false correlations\nlearned from a dataset under a stochastic perspective, modifying decisions for\nthe neurons considered as the main causes. We validate the effectiveness of our\nframework by addressing false correlations and improving inferences for classes\nwith the worst performance in real-world settings. Moreover, we demonstrate\nthat NeuroInspect helps debug the mistakes of DL models through evaluation for\nhuman understanding. 
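The hyperspectral entry above reduces dimension with multiview PCA (MPCA): spectral bands are grouped into views and PCA is applied to each view, after which the low-dimensional view representations are concatenated. A small numpy sketch of just that step, assuming contiguous band grouping (the abstract does not say how views are formed); the SED and SPTT stages are not shown.

```python
import numpy as np

def multiview_pca(X, n_views=4, n_components=3):
    """Split the spectral bands of pixels X (n_pixels, n_bands) into contiguous
    views, run PCA independently on each view, and concatenate the projections."""
    outputs = []
    for V in np.array_split(X, n_views, axis=1):
        Vc = V - V.mean(axis=0, keepdims=True)
        # PCA via SVD: rows of Vt are the principal directions of this view.
        _, _, Vt = np.linalg.svd(Vc, full_matrices=False)
        outputs.append(Vc @ Vt[:n_components].T)
    return np.concatenate(outputs, axis=1)    # (n_pixels, n_views * n_components)

hsi_pixels = np.random.rand(1000, 200)        # 1000 pixels, 200 spectral bands
rep = multiview_pca(hsi_pixels)
print(rep.shape)                              # (1000, 12)
```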
The code is openly available at\nhttps://github.com/yeongjoonJu/NeuroInspect.\n","authors":["Yeong-Joon Ju","Ji-Hoon Park","Seong-Whan Lee"],"pdf_url":"https://arxiv.org/pdf/2310.07184v1.pdf","comment":"Summitted to IEEE Transactions on Neural Networks and Learning\n Systems (TNNLS)"},{"id":"http://arxiv.org/abs/2310.07179v1","updated":"2023-10-11T04:05:11Z","published":"2023-10-11T04:05:11Z","title":"rpcPRF: Generalizable MPI Neural Radiance Field for Satellite Camera","summary":" Novel view synthesis of satellite images holds a wide range of practical\napplications. While recent advances in the Neural Radiance Field have\npredominantly targeted pin-hole cameras, and models for satellite cameras often\ndemand sufficient input views. This paper presents rpcPRF, a Multiplane Images\n(MPI) based Planar neural Radiance Field for Rational Polynomial Camera (RPC).\nUnlike coordinate-based neural radiance fields in need of sufficient views of\none scene, our model is applicable to single or few inputs and performs well on\nimages from unseen scenes. To enable generalization across scenes, we propose\nto use reprojection supervision to induce the predicted MPI to learn the\ncorrect geometry between the 3D coordinates and the images. Moreover, we remove\nthe stringent requirement of dense depth supervision from deep\nmultiview-stereo-based methods by introducing rendering techniques of radiance\nfields. rpcPRF combines the superiority of implicit representations and the\nadvantages of the RPC model, to capture the continuous altitude space while\nlearning the 3D structure. Given an RGB image and its corresponding RPC, the\nend-to-end model learns to synthesize the novel view with a new RPC and\nreconstruct the altitude of the scene. When multiple views are provided as\ninputs, rpcPRF exerts extra supervision provided by the extra views. On the TLC\ndataset from ZY-3, and the SatMVS3D dataset with urban scenes from WV-3, rpcPRF\noutperforms state-of-the-art nerf-based methods by a significant margin in\nterms of image fidelity, reconstruction accuracy, and efficiency, for both\nsingle-view and multiview task.\n","authors":["Tongtong Zhang","Yuanxiang Li"],"pdf_url":"https://arxiv.org/pdf/2310.07179v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.05136v3","updated":"2023-10-11T04:04:07Z","published":"2023-10-08T12:10:44Z","title":"InstructDET: Diversifying Referring Object Detection with Generalized\n Instructions","summary":" We propose InstructDET, a data-centric method for referring object detection\n(ROD) that localizes target objects based on user instructions. While deriving\nfrom referring expressions (REC), the instructions we leverage are greatly\ndiversified to encompass common user intentions related to object detection.\nFor one image, we produce tremendous instructions that refer to every single\nobject and different combinations of multiple objects. Each instruction and its\ncorresponding object bounding boxes (bbxs) constitute one training data pair.\nIn order to encompass common detection expressions, we involve emerging\nvision-language model (VLM) and large language model (LLM) to generate\ninstructions guided by text prompts and object bbxs, as the generalizations of\nfoundation models are effective to produce human-like expressions (e.g.,\ndescribing object property, category, and relationship). We name our\nconstructed dataset as InDET. It contains images, bbxs and generalized\ninstructions that are from foundation models. 
Our InDET is developed from\nexisting REC datasets and object detection datasets, with the expanding\npotential that any image with object bbxs can be incorporated through using our\nInstructDET method. By using our InDET dataset, we show that a conventional ROD\nmodel surpasses existing methods on standard REC datasets and our InDET test\nset. Our data-centric method InstructDET, with automatic data expansion by\nleveraging foundation models, directs a promising field that ROD can be greatly\ndiversified to execute common object detection instructions.\n","authors":["Ronghao Dang","Jiangyan Feng","Haodong Zhang","Chongjian Ge","Lin Song","Lijun Gong","Chengju Liu","Qijun Chen","Feng Zhu","Rui Zhao","Yibing Song"],"pdf_url":"https://arxiv.org/pdf/2310.05136v3.pdf","comment":"Adjust the subject"},{"id":"http://arxiv.org/abs/2310.07176v1","updated":"2023-10-11T04:00:17Z","published":"2023-10-11T04:00:17Z","title":"Improving mitosis detection on histopathology images using large\n vision-language models","summary":" In certain types of cancerous tissue, mitotic count has been shown to be\nassociated with tumor proliferation, poor prognosis, and therapeutic\nresistance. Due to the high inter-rater variability of mitotic counting by\npathologists, convolutional neural networks (CNNs) have been employed to reduce\nthe subjectivity of mitosis detection in hematoxylin and eosin (H&E)-stained\nwhole slide images. However, most existing models have performance that lags\nbehind expert panel review and only incorporate visual information. In this\nwork, we demonstrate that pre-trained large-scale vision-language models that\nleverage both visual features and natural language improve mitosis detection\naccuracy. We formulate the mitosis detection task as an image captioning task\nand a visual question answering (VQA) task by including metadata such as tumor\nand scanner types as context. The effectiveness of our pipeline is demonstrated\nvia comparison with various baseline models using 9,501 mitotic figures and\n11,051 hard negatives (non-mitotic figures that are difficult to characterize)\nfrom the publicly available Mitosis Domain Generalization Challenge (MIDOG22)\ndataset.\n","authors":["Ruiwen Ding","James Hall","Neil Tenenholtz","Kristen Severson"],"pdf_url":"https://arxiv.org/pdf/2310.07176v1.pdf","comment":"Submitted to IEEE ISBI 2024. Under review"},{"id":"http://arxiv.org/abs/2310.07166v1","updated":"2023-10-11T03:29:13Z","published":"2023-10-11T03:29:13Z","title":"Anchor-based Multi-view Subspace Clustering with Hierarchical Feature\n Descent","summary":" Multi-view clustering has attracted growing attention owing to its\ncapabilities of aggregating information from various sources and its promising\nhorizons in public affairs. Up till now, many advanced approaches have been\nproposed in recent literature. However, there are several ongoing difficulties\nto be tackled. One common dilemma occurs while attempting to align the features\nof different views. We dig out as well as deploy the dependency amongst views\nthrough hierarchical feature descent, which leads to a common latent space(\nSTAGE 1). This latent space, for the first time of its kind, is regarded as a\n'resemblance space', as it reveals certain correlations and dependencies of\ndifferent views. To be exact, the one-hot encoding of a category can also be\nreferred to as a resemblance space in its terminal phase. 
Moreover, due to the\nintrinsic fact that most of the existing multi-view clustering algorithms stem\nfrom k-means clustering and spectral clustering, this results in cubic time\ncomplexity w.r.t. the number of the objects. However, we propose Anchor-based\nMulti-view Subspace Clustering with Hierarchical Feature Descent(MVSC-HFD) to\nfurther reduce the computing complexity to linear time cost through a unified\nsampling strategy in resemblance space( STAGE 2), followed by subspace\nclustering to learn the representation collectively( STAGE 3). Extensive\nexperimental results on public benchmark datasets demonstrate that our proposed\nmodel consistently outperforms the state-of-the-art techniques.\n","authors":["Qiyuan Ou","Siwei Wang","Pei Zhang","Sihang Zhou","En Zhu"],"pdf_url":"https://arxiv.org/pdf/2310.07166v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.17546v2","updated":"2023-10-11T03:19:18Z","published":"2023-03-30T17:13:56Z","title":"PAIR-Diffusion: A Comprehensive Multimodal Object-Level Image Editor","summary":" Generative image editing has recently witnessed extremely fast-paced growth.\nSome works use high-level conditioning such as text, while others use low-level\nconditioning. Nevertheless, most of them lack fine-grained control over the\nproperties of the different objects present in the image, i.e.\\,object-level\nimage editing. In this work, we tackle the task by perceiving the images as an\namalgamation of various objects and aim to control the properties of each\nobject in a fine-grained manner. Out of these properties, we identify structure\nand appearance as the most intuitive to understand and useful for editing\npurposes. We propose \\textbf{PAIR} Diffusion, a generic framework that can\nenable a diffusion model to control the structure and appearance properties of\neach object in the image. We show that having control over the properties of\neach object in an image leads to comprehensive editing capabilities. Our\nframework allows for various object-level editing operations on real images\nsuch as reference image-based appearance editing, free-form shape editing,\nadding objects, and variations. Thanks to our design, we do not require any\ninversion step. Additionally, we propose multimodal classifier-free guidance\nwhich enables editing images using both reference images and text when using\nour approach with foundational diffusion models. We validate the above claims\nby extensively evaluating our framework on both unconditional and foundational\ndiffusion models. Please refer to\nhttps://vidit98.github.io/publication/conference-paper/pair_diff.html for code\nand model release.\n","authors":["Vidit Goel","Elia Peruzzo","Yifan Jiang","Dejia Xu","Xingqian Xu","Nicu Sebe","Trevor Darrell","Zhangyang Wang","Humphrey Shi"],"pdf_url":"https://arxiv.org/pdf/2303.17546v2.pdf","comment":"26 pages and 17 figures"},{"id":"http://arxiv.org/abs/2202.07870v2","updated":"2023-10-11T03:11:14Z","published":"2022-02-16T05:47:31Z","title":"IPD:An Incremental Prototype based DBSCAN for large-scale data with\n cluster representatives","summary":" DBSCAN is a fundamental density-based clustering technique that identifies\nany arbitrary shape of the clusters. However, it becomes infeasible while\nhandling big data. On the other hand, centroid-based clustering is important\nfor detecting patterns in a dataset since unprocessed data points can be\nlabeled to their nearest centroid. However, it can not detect non-spherical\nclusters. 
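The PAIR-Diffusion entry above mentions multimodal classifier-free guidance for editing with both a reference image and a text prompt. The abstract gives no formula, so the sketch below shows one common two-condition guidance composition (in the style popularized by InstructPix2Pix) purely for illustration; the guidance scales, tensor shapes, and function name are assumptions, not the paper's method.

```python
import torch

def multimodal_cfg(eps_uncond, eps_img, eps_both, s_img=1.5, s_txt=7.5):
    """One common way to combine two conditioning signals at sampling time:
    start from the unconditional noise prediction and add separately scaled
    corrections for the image condition and for the text condition.
    eps_img  : prediction conditioned on the reference image only
    eps_both : prediction conditioned on image + text
    """
    return (eps_uncond
            + s_img * (eps_img - eps_uncond)
            + s_txt * (eps_both - eps_img))

# Shapes follow a latent-diffusion noise prediction, e.g. (batch, 4, 64, 64).
e_u, e_i, e_b = (torch.randn(1, 4, 64, 64) for _ in range(3))
guided = multimodal_cfg(e_u, e_i, e_b)
print(guided.shape)
```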
For a large data, it is not feasible to store and compute labels of\nevery samples. These can be done as and when the information is required. The\npurpose can be accomplished when clustering act as a tool to identify cluster\nrepresentatives and query is served by assigning cluster labels of nearest\nrepresentative. In this paper, we propose an Incremental Prototype-based DBSCAN\n(IPD) algorithm which is designed to identify arbitrary-shaped clusters for\nlarge-scale data. Additionally, it chooses a set of representatives for each\ncluster.\n","authors":["Jayasree Saha","Jayanta Mukherjee"],"pdf_url":"https://arxiv.org/pdf/2202.07870v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.06925v2","updated":"2023-10-11T02:52:46Z","published":"2023-04-14T05:21:47Z","title":"YOLO-Drone:Airborne real-time detection of dense small objects from\n high-altitude perspective","summary":" Unmanned Aerial Vehicles (UAVs), specifically drones equipped with remote\nsensing object detection technology, have rapidly gained a broad spectrum of\napplications and emerged as one of the primary research focuses in the field of\ncomputer vision. Although UAV remote sensing systems have the ability to detect\nvarious objects, small-scale objects can be challenging to detect reliably due\nto factors such as object size, image degradation, and real-time limitations.\nTo tackle these issues, a real-time object detection algorithm (YOLO-Drone) is\nproposed and applied to two new UAV platforms as well as a specific light\nsource (silicon-based golden LED). YOLO-Drone presents several novelties: 1)\nincluding a new backbone Darknet59; 2) a new complex feature aggregation module\nMSPP-FPN that incorporated one spatial pyramid pooling and three atrous spatial\npyramid pooling modules; 3) and the use of Generalized Intersection over Union\n(GIoU) as the loss function. To evaluate performance, two benchmark datasets,\nUAVDT and VisDrone, along with one homemade dataset acquired at night under\nsilicon-based golden LEDs, are utilized. The experimental results show that, in\nboth UAVDT and VisDrone, the proposed YOLO-Drone outperforms state-of-the-art\n(SOTA) object detection methods by improving the mAP of 10.13% and 8.59%,\nrespectively. With regards to UAVDT, the YOLO-Drone exhibits both high\nreal-time inference speed of 53 FPS and a maximum mAP of 34.04%. Notably,\nYOLO-Drone achieves high performance under the silicon-based golden LEDs, with\na mAP of up to 87.71%, surpassing the performance of YOLO series under ordinary\nlight sources. To conclude, the proposed YOLO-Drone is a highly effective\nsolution for object detection in UAV applications, particularly for night\ndetection tasks where silicon-based golden light LED technology exhibits\nsignificant superiority.\n","authors":["Li Zhu","Jiahui Xiong","Feng Xiong","Hanzheng Hu","Zhengnan Jiang"],"pdf_url":"https://arxiv.org/pdf/2304.06925v2.pdf","comment":"Some contributing authors are not signed"},{"id":"http://arxiv.org/abs/2310.07149v1","updated":"2023-10-11T02:50:16Z","published":"2023-10-11T02:50:16Z","title":"Robust Unsupervised Domain Adaptation by Retaining Confident Entropy via\n Edge Concatenation","summary":" The generalization capability of unsupervised domain adaptation can mitigate\nthe need for extensive pixel-level annotations to train semantic segmentation\nnetworks by training models on synthetic data as a source with\ncomputer-generated annotations. 
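The YOLO-Drone entry above adopts Generalized Intersection over Union (GIoU) as its loss. The standard GIoU definition is compact enough to state directly; a self-contained sketch for axis-aligned boxes follows (the loss used in training would typically be 1 - GIoU), independent of the paper's implementation.

```python
def giou(box_a, box_b):
    """Generalized IoU for axis-aligned boxes (x1, y1, x2, y2):
    GIoU = IoU - (enclosing area not covered by the union) / enclosing area."""
    ax1, ay1, ax2, ay2 = box_a
    bx1, by1, bx2, by2 = box_b
    inter_w = max(0.0, min(ax2, bx2) - max(ax1, bx1))
    inter_h = max(0.0, min(ay2, by2) - max(ay1, by1))
    inter = inter_w * inter_h
    area_a = (ax2 - ax1) * (ay2 - ay1)
    area_b = (bx2 - bx1) * (by2 - by1)
    union = area_a + area_b - inter
    iou = inter / union
    # Smallest axis-aligned box enclosing both.
    enclose = (max(ax2, bx2) - min(ax1, bx1)) * (max(ay2, by2) - min(ay1, by1))
    return iou - (enclose - union) / enclose

print(giou((0, 0, 2, 2), (1, 1, 3, 3)))   # overlapping boxes
print(giou((0, 0, 1, 1), (2, 2, 3, 3)))   # disjoint boxes -> negative value
```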
Entropy-based adversarial networks are proposed\nto improve source domain prediction; however, they disregard significant\nexternal information, such as edges, which have the potential to identify and\ndistinguish various objects within an image accurately. To address this issue,\nwe introduce a novel approach to domain adaptation, leveraging the synergy of\ninternal and external information within entropy-based adversarial networks. In\nthis approach, we enrich the discriminator network with edge-predicted\nprobability values within this innovative framework to enhance the clarity of\nclass boundaries. Furthermore, we devised a probability-sharing network that\nintegrates diverse information for more effective segmentation. Incorporating\nobject edges addresses a pivotal aspect of unsupervised domain adaptation that\nhas frequently been neglected in the past -- the precise delineation of object\nboundaries. Conventional unsupervised domain adaptation methods usually center\naround aligning feature distributions and may not explicitly model object\nboundaries. Our approach effectively bridges this gap by offering clear\nguidance on object boundaries, thereby elevating the quality of domain\nadaptation. Our approach undergoes rigorous evaluation on the established\nunsupervised domain adaptation benchmarks, specifically in adapting SYNTHIA\n$\\rightarrow$ Cityscapes and SYNTHIA $\\rightarrow$ Mapillary. Experimental\nresults show that the proposed model attains better performance than\nstate-of-the-art methods. The superior performance across different\nunsupervised domain adaptation scenarios highlights the versatility and\nrobustness of the proposed method.\n","authors":["Hye-Seong Hong","Abhishek Kumar","Dong-Gyu Lee"],"pdf_url":"https://arxiv.org/pdf/2310.07149v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.06282v2","updated":"2023-10-11T02:46:12Z","published":"2023-10-10T03:32:33Z","title":"MuseChat: A Conversational Music Recommendation System for Videos","summary":" We introduce MuseChat, an innovative dialog-based music recommendation\nsystem. This unique platform not only offers interactive user engagement but\nalso suggests music tailored for input videos, so that users can refine and\npersonalize their music selections. In contrast, previous systems predominantly\nemphasized content compatibility, often overlooking the nuances of users'\nindividual preferences. For example, all the datasets only provide basic\nmusic-video pairings or such pairings with textual music descriptions. To\naddress this gap, our research offers three contributions. First, we devise a\nconversation-synthesis method that simulates a two-turn interaction between a\nuser and a recommendation system, which leverages pre-trained music tags and\nartist information. In this interaction, users submit a video to the system,\nwhich then suggests a suitable music piece with a rationale. Afterwards, users\ncommunicate their musical preferences, and the system presents a refined music\nrecommendation with reasoning. Second, we introduce a multi-modal\nrecommendation engine that matches music either by aligning it with visual cues\nfrom the video or by harmonizing visual information, feedback from previously\nrecommended music, and the user's textual input. Third, we bridge music\nrepresentations and textual data with a Large Language Model(Vicuna-7B). This\nalignment equips MuseChat to deliver music recommendations and their underlying\nreasoning in a manner resembling human communication. 
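The domain-adaptation entry above builds on entropy-based adversarial training, where the quantity fed to the discriminator is derived from the per-pixel entropy of the segmentation output, here enriched with edge probabilities. Below is a minimal sketch of only the entropy-map computation such methods start from; the edge branch, probability-sharing network, and discriminator are not shown, and the normalization by log(C) is our choice.

```python
import torch
import torch.nn.functional as F

def entropy_map(logits, eps=1e-8):
    """Per-pixel entropy of a segmentation prediction.
    logits: (B, C, H, W) raw scores -> (B, 1, H, W) entropy map,
    normalized by log(C) so values lie in [0, 1]."""
    p = F.softmax(logits, dim=1)
    ent = -(p * torch.log(p + eps)).sum(dim=1, keepdim=True)
    return ent / torch.log(torch.tensor(float(logits.shape[1])))

seg_logits = torch.randn(2, 19, 64, 128)     # e.g. 19 Cityscapes-style classes
ent = entropy_map(seg_logits)
print(ent.shape, float(ent.mean()))          # high entropy marks uncertain pixels
```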
Our evaluations show that\nMuseChat surpasses existing state-of-the-art models in music retrieval tasks\nand pioneers the integration of the recommendation process within a natural\nlanguage framework.\n","authors":["Zhikang Dong","Bin Chen","Xiulong Liu","Pawel Polak","Peng Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.06282v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15854v2","updated":"2023-10-11T02:34:23Z","published":"2023-08-30T08:40:15Z","title":"Zero-shot Inversion Process for Image Attribute Editing with Diffusion\n Models","summary":" Denoising diffusion models have shown outstanding performance in image\nediting. Existing works tend to use either image-guided methods, which provide\na visual reference but lack control over semantic coherence, or text-guided\nmethods, which ensure faithfulness to text guidance but lack visual quality. To\naddress the problem, we propose the Zero-shot Inversion Process (ZIP), a\nframework that injects a fusion of generated visual reference and text guidance\ninto the semantic latent space of a \\textit{frozen} pre-trained diffusion\nmodel. Only using a tiny neural network, the proposed ZIP produces diverse\ncontent and attributes under the intuitive control of the text prompt.\nMoreover, ZIP shows remarkable robustness for both in-domain and out-of-domain\nattribute manipulation on real images. We perform detailed experiments on\nvarious benchmark datasets. Compared to state-of-the-art methods, ZIP produces\nimages of equivalent quality while providing a realistic editing effect.\n","authors":["Zhanbo Feng","Zenan Ling","Ci Gong","Feng Zhou","Jie Li","Robert C. Qiu"],"pdf_url":"https://arxiv.org/pdf/2308.15854v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07138v1","updated":"2023-10-11T02:23:18Z","published":"2023-10-11T02:23:18Z","title":"Denoising Task Routing for Diffusion Models","summary":" Diffusion models generate highly realistic images through learning a\nmulti-step denoising process, naturally embodying the principles of multi-task\nlearning (MTL). Despite the inherent connection between diffusion models and\nMTL, there remains an unexplored area in designing neural architectures that\nexplicitly incorporate MTL into the framework of diffusion models. In this\npaper, we present Denoising Task Routing (DTR), a simple add-on strategy for\nexisting diffusion model architectures to establish distinct information\npathways for individual tasks within a single architecture by selectively\nactivating subsets of channels in the model. What makes DTR particularly\ncompelling is its seamless integration of prior knowledge of denoising tasks\ninto the framework: (1) Task Affinity: DTR activates similar channels for tasks\nat adjacent timesteps and shifts activated channels as sliding windows through\ntimesteps, capitalizing on the inherent strong affinity between tasks at\nadjacent timesteps. (2) Task Weights: During the early stages (higher\ntimesteps) of the denoising process, DTR assigns a greater number of\ntask-specific channels, leveraging the insight that diffusion models prioritize\nreconstructing global structure and perceptually rich contents in earlier\nstages, and focus on simple noise removal in later stages. Our experiments\ndemonstrate that DTR consistently enhances the performance of diffusion models\nacross various evaluation protocols, all without introducing additional\nparameters. Furthermore, DTR contributes to accelerating convergence during\ntraining. 
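The Denoising Task Routing entry above routes denoising tasks by activating a subset of channels per timestep, with adjacent timesteps sharing most channels (a sliding window) and higher timesteps receiving more task-specific channels. The toy mask below illustrates that behaviour only in spirit; the window fractions, sliding rule, and function name are made-up choices, not the paper's routing scheme.

```python
import torch

def routing_mask(t, T, n_channels, min_frac=0.3, max_frac=0.7):
    """Toy timestep-dependent channel mask: a contiguous window of active
    channels slides with the timestep (adjacent timesteps overlap heavily)
    and widens for higher timesteps (earlier, harder denoising stages)."""
    frac = min_frac + (max_frac - min_frac) * (t / (T - 1))
    width = max(1, int(round(frac * n_channels)))
    start = int(round((t / (T - 1)) * (n_channels - width)))
    mask = torch.zeros(n_channels)
    mask[start:start + width] = 1.0
    return mask

T, C = 1000, 64
for t in (0, 500, 999):
    print(t, int(routing_mask(t, T, C).sum()))   # active channel count grows with t

# Applying the mask to a (B, C, H, W) feature map:
feat = torch.randn(2, C, 16, 16)
gated = feat * routing_mask(750, T, C).view(1, C, 1, 1)
```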
Finally, we show the complementarity between our architectural\napproach and existing MTL optimization techniques, providing a more complete\nview of MTL within the context of diffusion training.\n","authors":["Byeongjun Park","Sangmin Woo","Hyojun Go","Jin-Young Kim","Changick Kim"],"pdf_url":"https://arxiv.org/pdf/2310.07138v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07131v1","updated":"2023-10-11T02:08:05Z","published":"2023-10-11T02:08:05Z","title":"Echocardiography video synthesis from end diastolic semantic map via\n diffusion model","summary":" Denoising Diffusion Probabilistic Models (DDPMs) have demonstrated\nsignificant achievements in various image and video generation tasks, including\nthe domain of medical imaging. However, generating echocardiography videos\nbased on semantic anatomical information remains an unexplored area of\nresearch. This is mostly due to the constraints imposed by the currently\navailable datasets, which lack sufficient scale and comprehensive frame-wise\nannotations for every cardiac cycle. This paper aims to tackle the\naforementioned challenges by expanding upon existing video diffusion models for\nthe purpose of cardiac video synthesis. More specifically, our focus lies in\ngenerating video using semantic maps of the initial frame during the cardiac\ncycle, commonly referred to as end diastole. To further improve the synthesis\nprocess, we integrate spatial adaptive normalization into multiscale feature\nmaps. This enables the inclusion of semantic guidance during synthesis,\nresulting in enhanced realism and coherence of the resultant video sequences.\nExperiments are conducted on the CAMUS dataset, which is a highly used dataset\nin the field of echocardiography. Our model exhibits better performance\ncompared to the standard diffusion technique in terms of multiple metrics,\nincluding FID, FVD, and SSMI.\n","authors":["Phi Nguyen Van","Duc Tran Minh","Hieu Pham Huy","Long Tran Quoc"],"pdf_url":"https://arxiv.org/pdf/2310.07131v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01636v2","updated":"2023-10-11T02:02:48Z","published":"2023-10-02T21:02:23Z","title":"Adaptive Visual Scene Understanding: Incremental Scene Graph Generation","summary":" Scene graph generation (SGG) involves analyzing images to extract meaningful\ninformation about objects and their relationships. Given the dynamic nature of\nthe visual world, it becomes crucial for AI systems to detect new objects and\nestablish their new relationships with existing objects. To address the lack of\ncontinual learning methodologies in SGG, we introduce the comprehensive\nContinual ScenE Graph Generation (CSEGG) dataset along with 3 learning\nscenarios and 8 evaluation metrics. Our research investigates the continual\nlearning performances of existing SGG methods on the retention of previous\nobject entities and relationships as they learn new ones. Moreover, we also\nexplore how continual object detection enhances generalization in classifying\nknown relationships on unknown objects. We conduct extensive experiments\nbenchmarking and analyzing the classical two-stage SGG methods and the most\nrecent transformer-based SGG methods in continual learning settings, and gain\nvaluable insights into the CSEGG problem. 
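The echocardiography-synthesis entry above injects semantic guidance by applying spatially adaptive normalization to multiscale feature maps. A minimal SPADE-style sketch of such a block follows; the channel sizes, hidden width, and the exact normalization layer are illustrative simplifications rather than the paper's implementation.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class SpatiallyAdaptiveNorm(nn.Module):
    """SPADE-style block: features are normalized without learned affine
    parameters, then modulated by per-pixel scale and shift predicted from a
    semantic map resized to the feature resolution."""
    def __init__(self, feat_channels, label_channels, hidden=64):
        super().__init__()
        self.norm = nn.BatchNorm2d(feat_channels, affine=False)
        self.shared = nn.Sequential(nn.Conv2d(label_channels, hidden, 3, padding=1),
                                    nn.ReLU())
        self.gamma = nn.Conv2d(hidden, feat_channels, 3, padding=1)
        self.beta = nn.Conv2d(hidden, feat_channels, 3, padding=1)

    def forward(self, feat, segmap):
        segmap = F.interpolate(segmap, size=feat.shape[-2:], mode='nearest')
        h = self.shared(segmap)
        return self.norm(feat) * (1 + self.gamma(h)) + self.beta(h)

feat = torch.randn(2, 128, 32, 32)        # a decoder feature map
segmap = torch.randn(2, 4, 256, 256)      # semantic map with 4 label channels
out = SpatiallyAdaptiveNorm(128, 4)(feat, segmap)
print(out.shape)                          # torch.Size([2, 128, 32, 32])
```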
We invite the research community to\nexplore this emerging field of study.\n","authors":["Naitik Khandelwal","Xiao Liu","Mengmi Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.01636v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.15142v4","updated":"2023-10-11T01:33:21Z","published":"2023-06-27T02:03:46Z","title":"LRANet: Towards Accurate and Efficient Scene Text Detection with\n Low-Rank Approximation Network","summary":" Recently, regression-based methods, which predict parameterized text shapes\nfor text localization, have gained popularity in scene text detection. However,\nthe existing parameterized text shape methods still have limitations in\nmodeling arbitrary-shaped texts due to ignoring the utilization of\ntext-specific shape information. Moreover, the time consumption of the entire\npipeline has been largely overlooked, leading to a suboptimal overall inference\nspeed. To address these issues, we first propose a novel parameterized text\nshape method based on low-rank approximation. Unlike other shape representation\nmethods that employ data-irrelevant parameterization, our approach utilizes\nsingular value decomposition and reconstructs the text shape using a few\neigenvectors learned from labeled text contours. By exploring the shape\ncorrelation among different text contours, our method achieves consistency,\ncompactness, simplicity, and robustness in shape representation. Next, we\npropose a dual assignment scheme for speed acceleration. It adopts a sparse\nassignment branch to accelerate the inference speed, and meanwhile, provides\nample supervised signals for training through a dense assignment branch.\nBuilding upon these designs, we implement an accurate and efficient\narbitrary-shaped text detector named LRANet. Extensive experiments are\nconducted on several challenging benchmarks, demonstrating the superior\naccuracy and efficiency of LRANet compared to state-of-the-art methods. Code\nwill be released soon.\n","authors":["Yuchen Su","Zhineng Chen","Zhiwen Shao","Yuning Du","Zhilong Ji","Jinfeng Bai","Yong Zhou","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2306.15142v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2106.14349v3","updated":"2023-10-11T01:30:37Z","published":"2021-06-28T00:34:15Z","title":"PNet -- A Deep Learning Based Photometry and Astrometry Bayesian\n Framework","summary":" Time domain astronomy has emerged as a vibrant research field in recent\nyears, focusing on celestial objects that exhibit variable magnitudes or\npositions. Given the urgency of conducting follow-up observations for such\nobjects, the development of an algorithm capable of detecting them and\ndetermining their magnitudes and positions has become imperative. Leveraging\nthe advancements in deep neural networks, we present the PNet, an end-to-end\nframework designed not only to detect celestial objects and extract their\nmagnitudes and positions but also to estimate photometry uncertainty. The PNet\ncomprises two essential steps. Firstly, it detects stars and retrieves their\npositions, magnitudes, and calibrated magnitudes. Subsequently, in the second\nphase, the PNet estimates the uncertainty associated with the photometry\nresults, serving as a valuable reference for the light curve classification\nalgorithm. Our algorithm has been tested using both simulated and real\nobservation data, demonstrating the PNet's ability to deliver consistent and\nreliable outcomes. 
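The LRANet entry above parameterizes text shapes by low-rank approximation: SVD on labeled contours yields a few shape eigenvectors, and each contour is then encoded as a handful of coefficients. A small numpy sketch of that representation idea on synthetic contours (the point counts, rank, and data here are invented for illustration).

```python
import numpy as np

rng = np.random.default_rng(0)

# Toy "labeled contours": N contours, each with K 2-D points, flattened to 2K values.
N, K, r = 200, 14, 4
base = np.stack([np.cos(np.linspace(0, 2 * np.pi, K)),
                 np.sin(np.linspace(0, 2 * np.pi, K))], axis=1).reshape(-1)
contours = base + 0.1 * rng.standard_normal((N, 2 * K))

# Learn a low-rank shape basis from the training contours via SVD.
mean = contours.mean(axis=0)
_, _, Vt = np.linalg.svd(contours - mean, full_matrices=False)
basis = Vt[:r]                                  # r shape eigenvectors

# A new contour is represented by r coefficients and reconstructed from them.
new_contour = base + 0.1 * rng.standard_normal(2 * K)
coeffs = (new_contour - mean) @ basis.T         # compact r-dimensional parameterization
recon = mean + coeffs @ basis
print(coeffs.shape, np.abs(recon - new_contour).max())
```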
Integration of the PNet into data processing pipelines for\ntime-domain astronomy holds significant potential for enhancing response speed\nand improving the detection capabilities for celestial objects with variable\npositions and magnitudes.\n","authors":["Rui Sun","Peng Jia","Yongyang Sun","Zhimin Yang","Qiang Liu","Hongyan Wei"],"pdf_url":"https://arxiv.org/pdf/2106.14349v3.pdf","comment":"To be published in the AJ and welcome to any comments"},{"id":"http://arxiv.org/abs/2310.04780v2","updated":"2023-10-11T00:38:50Z","published":"2023-10-07T11:45:33Z","title":"IPMix: Label-Preserving Data Augmentation Method for Training Robust\n Classifiers","summary":" Data augmentation has been proven effective for training high-accuracy\nconvolutional neural network classifiers by preventing overfitting. However,\nbuilding deep neural networks in real-world scenarios requires not only high\naccuracy on clean data but also robustness when data distributions shift. While\nprior methods have proposed that there is a trade-off between accuracy and\nrobustness, we propose IPMix, a simple data augmentation approach to improve\nrobustness without hurting clean accuracy. IPMix integrates three levels of\ndata augmentation (image-level, patch-level, and pixel-level) into a coherent\nand label-preserving technique to increase the diversity of training data with\nlimited computational overhead. To further improve the robustness, IPMix\nintroduces structural complexity at different levels to generate more diverse\nimages and adopts the random mixing method for multi-scale information fusion.\nExperiments demonstrate that IPMix outperforms state-of-the-art corruption\nrobustness on CIFAR-C and ImageNet-C. In addition, we show that IPMix also\nsignificantly improves the other safety measures, including robustness to\nadversarial perturbations, calibration, prediction consistency, and anomaly\ndetection, achieving state-of-the-art or comparable results on several\nbenchmarks, including ImageNet-R, ImageNet-A, and ImageNet-O.\n","authors":["Zhenglin Huang","Xianan Bao","Na Zhang","Qingqi Zhang","Xiaomei Tu","Biao Wu","Xi Yang"],"pdf_url":"https://arxiv.org/pdf/2310.04780v2.pdf","comment":null},{"id":"http://arxiv.org/abs/1910.11103v2","updated":"2023-10-11T00:11:45Z","published":"2019-10-16T23:30:22Z","title":"SPEC2: SPECtral SParsE CNN Accelerator on FPGAs","summary":" To accelerate inference of Convolutional Neural Networks (CNNs), various\ntechniques have been proposed to reduce computation redundancy. Converting\nconvolutional layers into frequency domain significantly reduces the\ncomputation complexity of the sliding window operations in space domain. On the\nother hand, weight pruning techniques address the redundancy in model\nparameters by converting dense convolutional kernels into sparse ones. To\nobtain high-throughput FPGA implementation, we propose SPEC2 -- the first work\nto prune and accelerate spectral CNNs. First, we propose a systematic pruning\nalgorithm based on Alternative Direction Method of Multipliers (ADMM). The\noffline pruning iteratively sets the majority of spectral weights to zero,\nwithout using any handcrafted heuristics. Then, we design an optimized pipeline\narchitecture on FPGA that has efficient random access into the sparse kernels\nand exploits various dimensions of parallelism in convolutional layers.\nOverall, SPEC2 achieves high inference throughput with extremely low\ncomputation complexity and negligible accuracy degradation. 
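The SPEC2 entry above builds on spectral CNNs, where a convolution becomes an element-wise product in the frequency domain, and pruning then zeroes most of the spectral weights. A tiny numpy check of that equivalence for circular convolution on an 8x8 tile; the ADMM pruning procedure and FPGA pipeline are well beyond this sketch.

```python
import numpy as np

rng = np.random.default_rng(1)
x = rng.standard_normal((8, 8))         # input tile
k = rng.standard_normal((8, 8))         # kernel zero-padded to the tile size

# Spectral convolution: element-wise product in the frequency domain.
spectral = np.real(np.fft.ifft2(np.fft.fft2(x) * np.fft.fft2(k)))

# Reference: direct circular convolution with nested loops.
direct = np.zeros_like(x)
for i in range(8):
    for j in range(8):
        for u in range(8):
            for v in range(8):
                direct[i, j] += x[u, v] * k[(i - u) % 8, (j - v) % 8]

print(np.max(np.abs(spectral - direct)))   # ~1e-13: the two computations agree
```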
We demonstrate\nSPEC2 by pruning and implementing LeNet and VGG16 on the Xilinx Virtex\nplatform. After pruning 75% of the spectral weights, SPEC2 achieves 0% accuracy\nloss for LeNet, and <1% accuracy loss for VGG16. The resulting accelerators\nachieve up to 24x higher throughput, compared with the state-of-the-art FPGA\nimplementations for VGG16.\n","authors":["Yue Niu","Hanqing Zeng","Ajitesh Srivastava","Kartik Lakhotia","Rajgopal Kannan","Yanzhi Wang","Viktor Prasanna"],"pdf_url":"https://arxiv.org/pdf/1910.11103v2.pdf","comment":"This is a 10-page conference paper in 26TH IEEE International\n Conference On High Performance Computing, Data, and Analytics (HiPC)"},{"id":"http://arxiv.org/abs/2306.06323v2","updated":"2023-10-11T23:40:03Z","published":"2023-06-10T00:27:37Z","title":"Learning Joint Latent Space EBM Prior Model for Multi-layer Generator","summary":" This paper studies the fundamental problem of learning multi-layer generator\nmodels. The multi-layer generator model builds multiple layers of latent\nvariables as a prior model on top of the generator, which benefits learning\ncomplex data distribution and hierarchical representations. However, such a\nprior model usually focuses on modeling inter-layer relations between latent\nvariables by assuming non-informative (conditional) Gaussian distributions,\nwhich can be limited in model expressivity. To tackle this issue and learn more\nexpressive prior models, we propose an energy-based model (EBM) on the joint\nlatent space over all layers of latent variables with the multi-layer generator\nas its backbone. Such joint latent space EBM prior model captures the\nintra-layer contextual relations at each layer through layer-wise energy terms,\nand latent variables across different layers are jointly corrected. We develop\na joint training scheme via maximum likelihood estimation (MLE), which involves\nMarkov Chain Monte Carlo (MCMC) sampling for both prior and posterior\ndistributions of the latent variables from different layers. To ensure\nefficient inference and learning, we further propose a variational training\nscheme where an inference model is used to amortize the costly posterior MCMC\nsampling. Our experiments demonstrate that the learned model can be expressive\nin generating high-quality images and capturing hierarchical features for\nbetter outlier detection.\n","authors":["Jiali Cui","Ying Nian Wu","Tian Han"],"pdf_url":"https://arxiv.org/pdf/2306.06323v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.03135v3","updated":"2023-10-11T23:38:36Z","published":"2023-07-06T17:05:26Z","title":"Distilling Large Vision-Language Model with Out-of-Distribution\n Generalizability","summary":" Large vision-language models have achieved outstanding performance, but their\nsize and computational requirements make their deployment on\nresource-constrained devices and time-sensitive tasks impractical. Model\ndistillation, the process of creating smaller, faster models that maintain the\nperformance of larger models, is a promising direction towards the solution.\nThis paper investigates the distillation of visual representations in large\nteacher vision-language models into lightweight student models using a small-\nor mid-scale dataset. Notably, this study focuses on open-vocabulary\nout-of-distribution (OOD) generalization, a challenging problem that has been\noverlooked in previous model distillation literature. 
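The joint latent-space EBM entry above relies on Langevin-style MCMC to sample latent variables under an energy-based prior. A minimal sketch of short-run Langevin sampling for a toy 2-D latent follows; the energy network, step size, iteration count, and the Gaussian reference term are illustrative choices, not the paper's training scheme.

```python
import torch
import torch.nn as nn

energy = nn.Sequential(nn.Linear(2, 64), nn.ReLU(), nn.Linear(64, 1))  # toy E(z)

def langevin_sample(energy, n=128, steps=60, step_size=0.05):
    """Short-run Langevin dynamics targeting p(z) proportional to
    exp(-E(z)) * N(z; 0, I):
    z <- z - (s/2) * grad_z [E(z) + ||z||^2 / 2] + sqrt(s) * noise."""
    z = torch.randn(n, 2)
    for _ in range(steps):
        z = z.detach().requires_grad_(True)
        e = energy(z).sum() + 0.5 * (z ** 2).sum()
        grad = torch.autograd.grad(e, z)[0]
        z = z - 0.5 * step_size * grad + step_size ** 0.5 * torch.randn_like(z)
    return z.detach()

samples = langevin_sample(energy)
print(samples.shape, samples.mean(0))
```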
We propose two principles\nfrom vision and language modality perspectives to enhance student's OOD\ngeneralization: (1) by better imitating teacher's visual representation space,\nand carefully promoting better coherence in vision-language alignment with the\nteacher; (2) by enriching the teacher's language representations with\ninformative and finegrained semantic attributes to effectively distinguish\nbetween different labels. We propose several metrics and conduct extensive\nexperiments to investigate their techniques. The results demonstrate\nsignificant improvements in zero-shot and few-shot student performance on\nopen-vocabulary out-of-distribution classification, highlighting the\neffectiveness of our proposed approaches. Poster:\nhttps://xuanlinli17.github.io/pdfs/iccv23_large_vlm_distillation_poster.pdf\nCode: https://github.com/xuanlinli17/large_vlm_distillation_ood\n","authors":["Xuanlin Li","Yunhao Fang","Minghua Liu","Zhan Ling","Zhuowen Tu","Hao Su"],"pdf_url":"https://arxiv.org/pdf/2307.03135v3.pdf","comment":"Published at International Conference on Computer Vision (ICCV) 2023.\n Poster at\n https://xuanlinli17.github.io/pdfs/iccv23_large_vlm_distillation_poster.pdf"},{"id":"http://arxiv.org/abs/2310.07932v1","updated":"2023-10-11T23:04:07Z","published":"2023-10-11T23:04:07Z","title":"What Matters to You? Towards Visual Representation Alignment for Robot\n Learning","summary":" When operating in service of people, robots need to optimize rewards aligned\nwith end-user preferences. Since robots will rely on raw perceptual inputs like\nRGB images, their rewards will inevitably use visual representations. Recently\nthere has been excitement in using representations from pre-trained visual\nmodels, but key to making these work in robotics is fine-tuning, which is\ntypically done via proxy tasks like dynamics prediction or enforcing temporal\ncycle-consistency. However, all these proxy tasks bypass the human's input on\nwhat matters to them, exacerbating spurious correlations and ultimately leading\nto robot behaviors that are misaligned with user preferences. In this work, we\npropose that robots should leverage human feedback to align their visual\nrepresentations with the end-user and disentangle what matters for the task. We\npropose Representation-Aligned Preference-based Learning (RAPL), a method for\nsolving the visual representation alignment problem and visual reward learning\nproblem through the lens of preference-based learning and optimal transport.\nAcross experiments in X-MAGICAL and in robotic manipulation, we find that\nRAPL's reward consistently generates preferred robot behaviors with high sample\nefficiency, and shows strong zero-shot generalization when the visual\nrepresentation is learned from a different embodiment than the robot's.\n","authors":["Ran Tian","Chenfeng Xu","Masayoshi Tomizuka","Jitendra Malik","Andrea Bajcsy"],"pdf_url":"https://arxiv.org/pdf/2310.07932v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07931v1","updated":"2023-10-11T23:01:29Z","published":"2023-10-11T23:01:29Z","title":"D2 Pruning: Message Passing for Balancing Diversity and Difficulty in\n Data Pruning","summary":" Analytical theories suggest that higher-quality data can lead to lower test\nerrors in models trained on a fixed data budget. Moreover, a model can be\ntrained on a lower compute budget without compromising performance if a dataset\ncan be stripped of its redundancies. 
Coreset selection (or data pruning) seeks\nto select a subset of the training data so as to maximize the performance of\nmodels trained on this subset, also referred to as coreset. There are two\ndominant approaches: (1) geometry-based data selection for maximizing data\ndiversity in the coreset, and (2) functions that assign difficulty scores to\nsamples based on training dynamics. Optimizing for data diversity leads to a\ncoreset that is biased towards easier samples, whereas, selection by difficulty\nranking omits easy samples that are necessary for the training of deep learning\nmodels. This demonstrates that data diversity and importance scores are two\ncomplementary factors that need to be jointly considered during coreset\nselection. We represent a dataset as an undirected graph and propose a novel\npruning algorithm, D2 Pruning, that uses forward and reverse message passing\nover this dataset graph for coreset selection. D2 Pruning updates the\ndifficulty scores of each example by incorporating the difficulty of its\nneighboring examples in the dataset graph. Then, these updated difficulty\nscores direct a graph-based sampling method to select a coreset that\nencapsulates both diverse and difficult regions of the dataset space. We\nevaluate supervised and self-supervised versions of our method on various\nvision and language datasets. Results show that D2 Pruning improves coreset\nselection over previous state-of-the-art methods for up to 70% pruning rates.\nAdditionally, we find that using D2 Pruning for filtering large multimodal\ndatasets leads to increased diversity in the dataset and improved\ngeneralization of pretrained models.\n","authors":["Adyasha Maharana","Prateek Yadav","Mohit Bansal"],"pdf_url":"https://arxiv.org/pdf/2310.07931v1.pdf","comment":"17 pages (Our code is available at\n https://github.com/adymaharana/d2pruning)"},{"id":"http://arxiv.org/abs/2308.12364v2","updated":"2023-10-11T22:38:38Z","published":"2023-08-23T18:08:32Z","title":"Saliency-based Video Summarization for Face Anti-spoofing","summary":" With the growing availability of databases for face presentation attack\ndetection, researchers are increasingly focusing on video-based face\nanti-spoofing methods that involve hundreds to thousands of images for training\nthe models. However, there is currently no clear consensus on the optimal\nnumber of frames in a video to improve face spoofing detection. Inspired by the\nvisual saliency theory, we present a video summarization method for face\nanti-spoofing detection that aims to enhance the performance and efficiency of\ndeep learning models by leveraging visual saliency. In particular, saliency\ninformation is extracted from the differences between the Laplacian and Wiener\nfilter outputs of the source images, enabling identification of the most\nvisually salient regions within each frame. Subsequently, the source images are\ndecomposed into base and detail images, enhancing the representation of the\nmost important information. Weighting maps are then computed based on the\nsaliency information, indicating the importance of each pixel in the image. By\nlinearly combining the base and detail images using the weighting maps, the\nmethod fuses the source images to create a single representative image that\nsummarizes the entire video. 
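The D2 Pruning entry above updates each example's difficulty score with the scores of its neighbours in a dataset graph before selecting a coreset. The sketch below shows one simplified round of that kind of score propagation on a kNN graph plus a naive top-score selection; the weighting, the single forward pass, and the selection rule are simplifications of (not equal to) the paper's forward/reverse message passing and graph-based sampler.

```python
import numpy as np

rng = np.random.default_rng(0)
emb = rng.standard_normal((500, 32))          # example embeddings
difficulty = rng.random(500)                  # per-example difficulty scores
k, gamma = 10, 0.5                            # neighbourhood size, mixing weight

# k-nearest-neighbour graph from pairwise Euclidean distances.
d = np.linalg.norm(emb[:, None] - emb[None, :], axis=-1)
np.fill_diagonal(d, np.inf)
nbrs = np.argsort(d, axis=1)[:, :k]

# One round of message passing: blend each score with its neighbours' scores,
# weighting closer neighbours more strongly.
w = np.exp(-np.take_along_axis(d, nbrs, axis=1))
updated = difficulty + gamma * (w * difficulty[nbrs]).sum(1) / w.sum(1)

# A naive selection: keep the examples with the highest updated scores.
coreset = np.argsort(-updated)[:50]
print(coreset[:10])
```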
The key contribution of the proposed method lies\nin demonstrating how visual saliency can be used as a data-centric approach to\nimprove the performance and efficiency for face presentation attack detection.\nBy focusing on the most salient images or regions within the images, a more\nrepresentative and diverse training set can be created, potentially leading to\nmore effective models. To validate the method's effectiveness, a simple CNN-RNN\ndeep learning architecture was used, and the experimental results showcased\nstate-of-the-art performance on five challenging face anti-spoofing datasets\n","authors":["Usman Muhammad","Mourad Oussalah","Jorma Laaksonen"],"pdf_url":"https://arxiv.org/pdf/2308.12364v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.14616v3","updated":"2023-10-11T22:15:54Z","published":"2023-09-26T02:09:52Z","title":"NDC-Scene: Boost Monocular 3D Semantic Scene Completion in Normalized\n Device Coordinates Space","summary":" Monocular 3D Semantic Scene Completion (SSC) has garnered significant\nattention in recent years due to its potential to predict complex semantics and\ngeometry shapes from a single image, requiring no 3D inputs. In this paper, we\nidentify several critical issues in current state-of-the-art methods, including\nthe Feature Ambiguity of projected 2D features in the ray to the 3D space, the\nPose Ambiguity of the 3D convolution, and the Computation Imbalance in the 3D\nconvolution across different depth levels. To address these problems, we devise\na novel Normalized Device Coordinates scene completion network (NDC-Scene) that\ndirectly extends the 2D feature map to a Normalized Device Coordinates (NDC)\nspace, rather than to the world space directly, through progressive restoration\nof the dimension of depth with deconvolution operations. Experiment results\ndemonstrate that transferring the majority of computation from the target 3D\nspace to the proposed normalized device coordinates space benefits monocular\nSSC tasks. Additionally, we design a Depth-Adaptive Dual Decoder to\nsimultaneously upsample and fuse the 2D and 3D feature maps, further improving\noverall performance. Our extensive experiments confirm that the proposed method\nconsistently outperforms state-of-the-art methods on both outdoor SemanticKITTI\nand indoor NYUv2 datasets. Our code are available at\nhttps://github.com/Jiawei-Yao0812/NDCScene.\n","authors":["Jiawei Yao","Chuming Li","Keqiang Sun","Yingjie Cai","Hao Li","Wanli Ouyang","Hongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2309.14616v3.pdf","comment":"Accepted at ICCV 2023. Project page:\n https://jiawei-yao0812.github.io/NDC-Scene/"},{"id":"http://arxiv.org/abs/2310.07916v1","updated":"2023-10-11T22:04:33Z","published":"2023-10-11T22:04:33Z","title":"Dynamic Appearance Particle Neural Radiance Field","summary":" Neural Radiance Fields (NeRFs) have shown great potential in modelling 3D\nscenes. Dynamic NeRFs extend this model by capturing time-varying elements,\ntypically using deformation fields. The existing dynamic NeRFs employ a similar\nEulerian representation for both light radiance and deformation fields. This\nleads to a close coupling of appearance and motion and lacks a physical\ninterpretation. In this work, we propose Dynamic Appearance Particle Neural\nRadiance Field (DAP-NeRF), which introduces particle-based representation to\nmodel the motions of visual elements in a dynamic 3D scene. DAP-NeRF consists\nof superposition of a static field and a dynamic field. 
The dynamic field is\nquantised as a collection of {\\em appearance particles}, which carries the\nvisual information of a small dynamic element in the scene and is equipped with\na motion model. All components, including the static field, the visual features\nand motion models of the particles, are learned from monocular videos without\nany prior geometric knowledge of the scene. We develop an efficient\ncomputational framework for the particle-based model. We also construct a new\ndataset to evaluate motion modelling. Experimental results show that DAP-NeRF\nis an effective technique to capture not only the appearance but also the\nphysically meaningful motions in a 3D dynamic scene.\n","authors":["Ancheng Lin","Jun Li"],"pdf_url":"https://arxiv.org/pdf/2310.07916v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07896v1","updated":"2023-10-11T21:07:14Z","published":"2023-10-11T21:07:14Z","title":"NoMaD: Goal Masked Diffusion Policies for Navigation and Exploration","summary":" Robotic learning for navigation in unfamiliar environments needs to provide\npolicies for both task-oriented navigation (i.e., reaching a goal that the\nrobot has located), and task-agnostic exploration (i.e., searching for a goal\nin a novel setting). Typically, these roles are handled by separate models, for\nexample by using subgoal proposals, planning, or separate navigation\nstrategies. In this paper, we describe how we can train a single unified\ndiffusion policy to handle both goal-directed navigation and goal-agnostic\nexploration, with the latter providing the ability to search novel\nenvironments, and the former providing the ability to reach a user-specified\ngoal once it has been located. We show that this unified policy results in\nbetter overall performance when navigating to visually indicated goals in novel\nenvironments, as compared to approaches that use subgoal proposals from\ngenerative models, or prior methods based on latent variable models. We\ninstantiate our method by using a large-scale Transformer-based policy trained\non data from multiple ground robots, with a diffusion model decoder to flexibly\nhandle both goal-conditioned and goal-agnostic navigation. Our experiments,\nconducted on a real-world mobile robot platform, show effective navigation in\nunseen environments in comparison with five alternative methods, and\ndemonstrate significant improvements in performance and lower collision rates,\ndespite utilizing smaller models than state-of-the-art approaches. For more\nvideos, code, and pre-trained model checkpoints, see\nhttps://general-navigation-models.github.io/nomad/\n","authors":["Ajay Sridhar","Dhruv Shah","Catherine Glossop","Sergey Levine"],"pdf_url":"https://arxiv.org/pdf/2310.07896v1.pdf","comment":"Project page https://general-navigation-models.github.io/nomad/"},{"id":"http://arxiv.org/abs/2310.07894v1","updated":"2023-10-11T21:04:42Z","published":"2023-10-11T21:04:42Z","title":"Efficient Integrators for Diffusion Generative Models","summary":" Diffusion models suffer from slow sample generation at inference time.\nTherefore, developing a principled framework for fast deterministic/stochastic\nsampling for a broader class of diffusion models is a promising direction. We\npropose two complementary frameworks for accelerating sample generation in\npre-trained models: Conjugate Integrators and Splitting Integrators. Conjugate\nintegrators generalize DDIM, mapping the reverse diffusion dynamics to a more\namenable space for sampling. 
In contrast, splitting-based integrators, commonly\nused in molecular dynamics, reduce the numerical simulation error by cleverly\nalternating between numerical updates involving the data and auxiliary\nvariables. After extensively studying these methods empirically and\ntheoretically, we present a hybrid method that leads to the best-reported\nperformance for diffusion models in augmented spaces. Applied to Phase Space\nLangevin Diffusion [Pandey & Mandt, 2023] on CIFAR-10, our deterministic and\nstochastic samplers achieve FID scores of 2.11 and 2.36 in only 100 network\nfunction evaluations (NFE) as compared to 2.57 and 2.63 for the best-performing\nbaselines, respectively. Our code and model checkpoints will be made publicly\navailable at \\url{https://github.com/mandt-lab/PSLD}.\n","authors":["Kushagra Pandey","Maja Rudolph","Stephan Mandt"],"pdf_url":"https://arxiv.org/pdf/2310.07894v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07889v1","updated":"2023-10-11T20:52:30Z","published":"2023-10-11T20:52:30Z","title":"LangNav: Language as a Perceptual Representation for Navigation","summary":" We explore the use of language as a perceptual representation for\nvision-and-language navigation. Our approach uses off-the-shelf vision systems\n(for image captioning and object detection) to convert an agent's egocentric\npanoramic view at each time step into natural language descriptions. We then\nfinetune a pretrained language model to select an action, based on the current\nview and the trajectory history, that would best fulfill the navigation\ninstructions. In contrast to the standard setup which adapts a pretrained\nlanguage model to work directly with continuous visual features from pretrained\nvision models, our approach instead uses (discrete) language as the perceptual\nrepresentation. We explore two use cases of our language-based navigation\n(LangNav) approach on the R2R vision-and-language navigation benchmark:\ngenerating synthetic trajectories from a prompted large language model (GPT-4)\nwith which to finetune a smaller language model; and sim-to-real transfer where\nwe transfer a policy learned on a simulated environment (ALFRED) to a\nreal-world environment (R2R). Our approach is found to improve upon strong\nbaselines that rely on visual features in settings where only a few gold\ntrajectories (10-100) are available, demonstrating the potential of using\nlanguage as a perceptual representation for navigation tasks.\n","authors":["Bowen Pan","Rameswar Panda","SouYoung Jin","Rogerio Feris","Aude Oliva","Phillip Isola","Yoon Kim"],"pdf_url":"https://arxiv.org/pdf/2310.07889v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07887v1","updated":"2023-10-11T20:48:20Z","published":"2023-10-11T20:48:20Z","title":"Unsupervised Structured Noise Removal with Variational Lossy Autoencoder","summary":" Most unsupervised denoising methods are based on the assumption that imaging\nnoise is either pixel-independent, i.e., spatially uncorrelated, or\nsignal-independent, i.e., purely additive. However, in practice many imaging\nsetups, especially in microscopy, suffer from a combination of signal-dependent\nnoise (e.g. Poisson shot noise) and axis-aligned correlated noise (e.g. stripe\nshaped scanning or readout artifacts). In this paper, we present the first\nunsupervised deep learning-based denoiser that can remove this type of noise\nwithout access to any clean images or a noise model. 
Unlike self-supervised\ntechniques, our method does not rely on removing pixels by masking or\nsubsampling so can utilize all available information. We implement a\nVariational Autoencoder (VAE) with a specially designed autoregressive decoder\ncapable of modelling the noise component of an image but incapable of\nindependently modelling the underlying clean signal component. As a\nconsequence, our VAE's encoder learns to encode only underlying clean signal\ncontent and to discard imaging noise. We also propose an additional decoder for\nmapping the encoder's latent variables back into image space, thereby sampling\ndenoised images. Experimental results demonstrate that our approach surpasses\nexisting methods for self- and unsupervised image denoising while being robust\nwith respect to the size of the autoregressive receptive field. Code for this\nproject can be found at https://github.com/krulllab/DVLAE.\n","authors":["Benjamin Salmon","Alexander Krull"],"pdf_url":"https://arxiv.org/pdf/2310.07887v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07886v1","updated":"2023-10-11T20:48:19Z","published":"2023-10-11T20:48:19Z","title":"A Survey of Feature Types and Their Contributions for Camera Tampering\n Detection","summary":" Camera tamper detection is the ability to detect unauthorized and\nunintentional alterations in surveillance cameras by analyzing the video.\nCamera tampering can occur due to natural events or it can be caused\nintentionally to disrupt surveillance. We cast tampering detection as a change\ndetection problem, and perform a review of the existing literature with\nemphasis on feature types. We formulate tampering detection as a time series\nanalysis problem, and design experiments to study the robustness and capability\nof various feature types. We compute ten features on real-world surveillance\nvideo and apply time series analysis to ascertain their predictability, and\ntheir capability to detect tampering. Finally, we quantify the performance of\nvarious time series models using each feature type to detect tampering.\n","authors":["Pranav Mantini","Shishir K. Shah"],"pdf_url":"https://arxiv.org/pdf/2310.07886v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.16319v2","updated":"2023-10-11T20:33:37Z","published":"2023-05-25T17:59:50Z","title":"Image as First-Order Norm+Linear Autoregression: Unveiling Mathematical\n Invariance","summary":" This paper introduces a novel mathematical property applicable to diverse\nimages, referred to as FINOLA (First-Order Norm+Linear Autoregressive). FINOLA\nrepresents each image in the latent space as a first-order autoregressive\nprocess, in which each regression step simply applies a shared linear model on\nthe normalized value of its immediate neighbor. This intriguing property\nreveals a mathematical invariance that transcends individual images. Expanding\nfrom image grids to continuous coordinates, we unveil the presence of two\nunderlying partial differential equations. We validate the FINOLA property from\ntwo distinct angles: image reconstruction and self-supervised learning.\nFirstly, we demonstrate the ability of FINOLA to auto-regress up to a 256x256\nfeature map (the same resolution to the image) from a single vector placed at\nthe center, successfully reconstructing the original image by only using three\n3x3 convolution layers as decoder. 
Secondly, we leverage FINOLA for\nself-supervised learning by employing a simple masked prediction approach.\nEncoding a single unmasked quadrant block, we autoregressively predict the\nsurrounding masked region. Remarkably, this pre-trained representation proves\nhighly effective in image classification and object detection tasks, even when\nintegrated into lightweight networks, all without the need for extensive\nfine-tuning. The code will be made publicly available.\n","authors":["Yinpeng Chen","Xiyang Dai","Dongdong Chen","Mengchen Liu","Lu Yuan","Zicheng Liu","Youzuo Lin"],"pdf_url":"https://arxiv.org/pdf/2305.16319v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.01748v2","updated":"2023-10-11T20:28:58Z","published":"2023-03-03T07:20:58Z","title":"A Complete Recipe for Diffusion Generative Models","summary":" Score-based Generative Models (SGMs) have demonstrated exceptional synthesis\noutcomes across various tasks. However, the current design landscape of the\nforward diffusion process remains largely untapped and often relies on physical\nheuristics or simplifying assumptions. Utilizing insights from the development\nof scalable Bayesian posterior samplers, we present a complete recipe for\nformulating forward processes in SGMs, ensuring convergence to the desired\ntarget distribution. Our approach reveals that several existing SGMs can be\nseen as specific manifestations of our framework. Building upon this method, we\nintroduce Phase Space Langevin Diffusion (PSLD), which relies on score-based\nmodeling within an augmented space enriched by auxiliary variables akin to\nphysical phase space. Empirical results exhibit the superior sample quality and\nimproved speed-quality trade-off of PSLD compared to various competing\napproaches on established image synthesis benchmarks. Remarkably, PSLD achieves\nsample quality akin to state-of-the-art SGMs (FID: 2.10 for unconditional\nCIFAR-10 generation). Lastly, we demonstrate the applicability of PSLD in\nconditional synthesis using pre-trained score networks, offering an appealing\nalternative as an SGM backbone for future advancements. Code and model\ncheckpoints can be accessed at \\url{https://github.com/mandt-lab/PSLD}.\n","authors":["Kushagra Pandey","Stephan Mandt"],"pdf_url":"https://arxiv.org/pdf/2303.01748v2.pdf","comment":"Accepted in ICCV'23 (Oral Presentation)"},{"id":"http://arxiv.org/abs/2210.09222v2","updated":"2023-10-11T19:59:02Z","published":"2022-10-14T08:05:16Z","title":"MMTSA: Multimodal Temporal Segment Attention Network for Efficient Human\n Activity Recognition","summary":" Multimodal sensors provide complementary information to develop accurate\nmachine-learning methods for human activity recognition (HAR), but introduce\nsignificantly higher computational load, which reduces efficiency. This paper\nproposes an efficient multimodal neural architecture for HAR using an RGB\ncamera and inertial measurement units (IMUs) called Multimodal Temporal Segment\nAttention Network (MMTSA). MMTSA first transforms IMU sensor data into a\ntemporal and structure-preserving gray-scale image using the Gramian Angular\nField (GAF), representing the inherent properties of human activities. MMTSA\nthen applies a multimodal sparse sampling method to reduce data redundancy.\nLastly, MMTSA adopts an inter-segment attention module for efficient multimodal\nfusion. Using three well-established public datasets, we evaluated MMTSA's\neffectiveness and efficiency in HAR. 
Results show that our method achieves\nsuperior performance improvements 11.13% of cross-subject F1-score on the MMAct\ndataset than the previous state-of-the-art (SOTA) methods. The ablation study\nand analysis suggest that MMTSA's effectiveness in fusing multimodal data for\naccurate HAR. The efficiency evaluation on an edge device showed that MMTSA\nachieved significantly better accuracy, lower computational load, and lower\ninference latency than SOTA methods.\n","authors":["Ziqi Gao","Yuntao Wang","Jianguo Chen","Junliang Xing","Shwetak Patel","Xin Liu","Yuanchun Shi"],"pdf_url":"https://arxiv.org/pdf/2210.09222v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07855v1","updated":"2023-10-11T19:57:51Z","published":"2023-10-11T19:57:51Z","title":"CrIBo: Self-Supervised Learning via Cross-Image Object-Level\n Bootstrapping","summary":" Leveraging nearest neighbor retrieval for self-supervised representation\nlearning has proven beneficial with object-centric images. However, this\napproach faces limitations when applied to scene-centric datasets, where\nmultiple objects within an image are only implicitly captured in the global\nrepresentation. Such global bootstrapping can lead to undesirable entanglement\nof object representations. Furthermore, even object-centric datasets stand to\nbenefit from a finer-grained bootstrapping approach. In response to these\nchallenges, we introduce a novel Cross-Image Object-Level Bootstrapping method\ntailored to enhance dense visual representation learning. By employing\nobject-level nearest neighbor bootstrapping throughout the training, CrIBo\nemerges as a notably strong and adequate candidate for in-context learning,\nleveraging nearest neighbor retrieval at test time. CrIBo shows\nstate-of-the-art performance on the latter task while being highly competitive\nin more standard downstream segmentation tasks. Our code and pretrained models\nwill be publicly available upon acceptance.\n","authors":["Tim Lebailly","Thomas Stegmüller","Behzad Bozorgtabar","Jean-Philippe Thiran","Tinne Tuytelaars"],"pdf_url":"https://arxiv.org/pdf/2310.07855v1.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2310.07713v1","updated":"2023-10-11T17:59:05Z","published":"2023-10-11T17:59:05Z","title":"InstructRetro: Instruction Tuning post Retrieval-Augmented Pretraining","summary":" Pretraining auto-regressive large language models (LLMs) with retrieval\ndemonstrates better perplexity and factual accuracy by leveraging external\ndatabases. However, the size of existing pretrained retrieval-augmented LLM is\nstill limited (e.g., Retro has 7.5B parameters), which limits the effectiveness\nof instruction tuning and zero-shot generalization. In this work, we introduce\nRetro 48B, the largest LLM pretrained with retrieval before instruction tuning.\nSpecifically, we continue to pretrain the 43B GPT model on additional 100\nbillion tokens using the Retro augmentation method by retrieving from 1.2\ntrillion tokens. The obtained foundation model, Retro 48B, largely outperforms\nthe original 43B GPT in terms of perplexity. After instruction tuning on Retro,\nInstructRetro demonstrates significant improvement over the instruction tuned\nGPT on zero-shot question answering (QA) tasks. 
Specifically, the average\nimprovement of InstructRetro is 7% over its GPT counterpart across 8 short-form\nQA tasks, and 10% over GPT across 4 challenging long-form QA tasks.\nSurprisingly, we find that one can ablate the encoder from InstructRetro\narchitecture and directly use its decoder backbone, while achieving comparable\nresults. We hypothesize that pretraining with retrieval makes its decoder good\nat incorporating context for QA. Our results highlights the promising direction\nto obtain a better GPT decoder for QA through continued pretraining with\nretrieval before instruction tuning.\n","authors":["Boxin Wang","Wei Ping","Lawrence McAfee","Peng Xu","Bo Li","Mohammad Shoeybi","Bryan Catanzaro"],"pdf_url":"https://arxiv.org/pdf/2310.07713v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.08703v3","updated":"2023-10-11T17:00:34Z","published":"2023-05-15T15:06:20Z","title":"Schema-adaptable Knowledge Graph Construction","summary":" Conventional Knowledge Graph Construction (KGC) approaches typically follow\nthe static information extraction paradigm with a closed set of pre-defined\nschema. As a result, such approaches fall short when applied to dynamic\nscenarios or domains, whereas a new type of knowledge emerges. This\nnecessitates a system that can handle evolving schema automatically to extract\ninformation for KGC. To address this need, we propose a new task called\nschema-adaptable KGC, which aims to continually extract entity, relation, and\nevent based on a dynamically changing schema graph without re-training. We\nfirst split and convert existing datasets based on three principles to build a\nbenchmark, i.e., horizontal schema expansion, vertical schema expansion, and\nhybrid schema expansion; then investigate the schema-adaptable performance of\nseveral well-known approaches such as Text2Event, TANL, UIE and GPT-3.5. We\nfurther propose a simple yet effective baseline dubbed \\textsc{AdaKGC}, which\ncontains schema-enriched prefix instructor and schema-conditioned dynamic\ndecoding to better handle evolving schema. Comprehensive experimental results\nillustrate that AdaKGC can outperform baselines but still have room for\nimprovement. We hope the proposed work can deliver benefits to the community.\nCode and datasets available at https://github.com/zjunlp/AdaKGC.\n","authors":["Hongbin Ye","Honghao Gui","Xin Xu","Huajun Chen","Ningyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2305.08703v3.pdf","comment":"EMNLP 2023 (Findings)"},{"id":"http://arxiv.org/abs/2305.13172v2","updated":"2023-10-11T16:51:50Z","published":"2023-05-22T16:00:00Z","title":"Editing Large Language Models: Problems, Methods, and Opportunities","summary":" Despite the ability to train capable LLMs, the methodology for maintaining\ntheir relevancy and rectifying errors remains elusive. To this end, the past\nfew years have witnessed a surge in techniques for editing LLMs, the objective\nof which is to efficiently alter the behavior of LLMs within a specific domain\nwithout negatively impacting performance across other inputs. This paper\nembarks on a deep exploration of the problems, methods, and opportunities\nrelated to model editing for LLMs. In particular, we provide an exhaustive\noverview of the task definition and challenges associated with model editing,\nalong with an in-depth empirical analysis of the most progressive methods\ncurrently at our disposal. 
We also build a new benchmark dataset to facilitate\na more robust evaluation and pinpoint enduring issues intrinsic to existing\ntechniques. Our objective is to provide valuable insights into the\neffectiveness and feasibility of each editing technique, thereby assisting the\ncommunity in making informed decisions on the selection of the most appropriate\nmethod for a specific task or context. Code and datasets are available at\nhttps://github.com/zjunlp/EasyEdit.\n","authors":["Yunzhi Yao","Peng Wang","Bozhong Tian","Siyuan Cheng","Zhoubo Li","Shumin Deng","Huajun Chen","Ningyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2305.13172v2.pdf","comment":"EMNLP 2023. Updated with new experiments"},{"id":"http://arxiv.org/abs/2211.15743v3","updated":"2023-10-11T16:18:59Z","published":"2022-11-28T19:49:02Z","title":"Towards Reliable Item Sampling for Recommendation Evaluation","summary":" Since Rendle and Krichene argued that commonly used sampling-based evaluation\nmetrics are \"inconsistent\" with respect to the global metrics (even in\nexpectation), there have been a few studies on the sampling-based recommender\nsystem evaluation. Existing methods try either mapping the sampling-based\nmetrics to their global counterparts or more generally, learning the empirical\nrank distribution to estimate the top-$K$ metrics. However, despite existing\nefforts, there is still a lack of rigorous theoretical understanding of the\nproposed metric estimators, and the basic item sampling also suffers from the\n\"blind spot\" issue, i.e., estimation accuracy to recover the top-$K$ metrics\nwhen $K$ is small can still be rather substantial. In this paper, we provide an\nin-depth investigation into these problems and make two innovative\ncontributions. First, we propose a new item-sampling estimator that explicitly\noptimizes the error with respect to the ground truth, and theoretically\nhighlight its subtle difference against prior work. Second, we propose a new\nadaptive sampling method which aims to deal with the \"blind spot\" problem and\nalso demonstrate the expectation-maximization (EM) algorithm can be generalized\nfor such a setting. Our experimental results confirm our statistical analysis\nand the superiority of the proposed works. This study helps lay the theoretical\nfoundation for adopting item sampling metrics for recommendation evaluation,\nand provides strong evidence towards making item sampling a powerful and\nreliable tool for recommendation evaluation.\n","authors":["Dong Li","Ruoming Jin","Zhenming Liu","Bin Ren","Jing Gao","Zhi Liu"],"pdf_url":"https://arxiv.org/pdf/2211.15743v3.pdf","comment":"aaai2023"},{"id":"http://arxiv.org/abs/2310.07554v1","updated":"2023-10-11T14:59:53Z","published":"2023-10-11T14:59:53Z","title":"Retrieve Anything To Augment Large Language Models","summary":" Large language models (LLMs) face significant challenges stemming from the\ninherent limitations in knowledge, memory, alignment, and action. These\nchallenges cannot be addressed by LLMs alone, but should rely on assistance\nfrom the external world, such as knowledge base, memory store, demonstration\nexamples, and tools. Retrieval augmentation stands as a vital mechanism for\nbridging the gap between LLMs and the external assistance. However,\nconventional methods encounter two pressing issues. On one hand, the\ngeneral-purpose retrievers are not properly optimized for the retrieval\naugmentation of LLMs. 
On the other hand, the task-specific retrievers lack the\nrequired versatility, hindering their performance across the diverse retrieval\naugmentation scenarios.\n In this work, we present a novel approach, the LLM Embedder, which\ncomprehensively support the diverse needs of LLMs' retrieval augmentation with\none unified embedding model. Training such an unified model is non-trivial, as\nvarious retrieval tasks aim to capture distinct semantic relationships, often\nsubject to mutual interference. To address this challenge, we systematically\noptimize our training methodology. This includes reward formulation based on\nLLMs' feedback, the stabilization of knowledge distillation, multi-task\nfine-tuning with explicit instructions, and the use of homogeneous in-batch\nnegative sampling. These optimization strategies contribute to the outstanding\nempirical performance of the LLM-Embedder. Notably, it yields remarkable\nenhancements in retrieval augmentation for LLMs, surpassing both\ngeneral-purpose and task-specific retrievers in various evaluation scenarios.\nThis project is made publicly available at\nhttps://github.com/FlagOpen/FlagEmbedding.\n","authors":["Peitian Zhang","Shitao Xiao","Zheng Liu","Zhicheng Dou","Jian-Yun Nie"],"pdf_url":"https://arxiv.org/pdf/2310.07554v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09447v3","updated":"2023-10-11T13:46:25Z","published":"2023-07-18T17:22:19Z","title":"Deep Neural Aggregation for Recommending Items to Group of Users","summary":" Modern society devotes a significant amount of time to digital interaction.\nMany of our daily actions are carried out through digital means. This has led\nto the emergence of numerous Artificial Intelligence tools that assist us in\nvarious aspects of our lives. One key tool for the digital society is\nRecommender Systems, intelligent systems that learn from our past actions to\npropose new ones that align with our interests. Some of these systems have\nspecialized in learning from the behavior of user groups to make\nrecommendations to a group of individuals who want to perform a joint task. In\nthis article, we analyze the current state of Group Recommender Systems and\npropose two new models that use emerging Deep Learning architectures.\nExperimental results demonstrate the improvement achieved by employing the\nproposed models compared to the state-of-the-art models using four different\ndatasets. The source code of the models, as well as that of all the experiments\nconducted, is available in a public repository.\n","authors":["Jorge Dueñas-Lerín","Raúl Lara-Cabrera","Fernando Ortega","Jesús Bobadilla"],"pdf_url":"https://arxiv.org/pdf/2307.09447v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07477v1","updated":"2023-10-11T13:24:38Z","published":"2023-10-11T13:24:38Z","title":"GMOCAT: A Graph-Enhanced Multi-Objective Method for Computerized\n Adaptive Testing","summary":" Computerized Adaptive Testing(CAT) refers to an online system that adaptively\nselects the best-suited question for students with various abilities based on\ntheir historical response records. Most CAT methods only focus on the quality\nobjective of predicting the student ability accurately, but neglect concept\ndiversity or question exposure control, which are important considerations in\nensuring the performance and validity of CAT. Besides, the students' response\nrecords contain valuable relational information between questions and knowledge\nconcepts. 
The previous methods ignore this relational information, resulting in\nthe selection of sub-optimal test questions. To address these challenges, we\npropose a Graph-Enhanced Multi-Objective method for CAT (GMOCAT). Firstly,\nthree objectives, namely quality, diversity and novelty, are introduced into\nthe Scalarized Multi-Objective Reinforcement Learning framework of CAT, which\nrespectively correspond to improving the prediction accuracy, increasing the\nconcept diversity and reducing the question exposure. We use an Actor-Critic\nRecommender to select questions and optimize three objectives simultaneously by\nthe scalarization function. Secondly, we utilize the graph neural network to\nlearn relation-aware embeddings of questions and concepts. These embeddings are\nable to aggregate neighborhood information in the relation graphs between\nquestions and concepts. We conduct experiments on three real-world educational\ndatasets, and show that GMOCAT not only outperforms the state-of-the-art\nmethods in the ability prediction, but also achieve superior performance in\nimproving the concept diversity and alleviating the question exposure. Our code\nis available at https://github.com/justarter/GMOCAT.\n","authors":["Hangyu Wang","Ting Long","Liang Yin","Weinan Zhang","Wei Xia","Qichen Hong","Dingyin Xia","Ruiming Tang","Yong Yu"],"pdf_url":"https://arxiv.org/pdf/2310.07477v1.pdf","comment":"KDD23"},{"id":"http://arxiv.org/abs/2303.09902v2","updated":"2023-10-11T12:48:30Z","published":"2023-03-17T11:39:35Z","title":"Contrastive Self-supervised Learning in Recommender Systems: A Survey","summary":" Deep learning-based recommender systems have achieved remarkable success in\nrecent years. However, these methods usually heavily rely on labeled data\n(i.e., user-item interactions), suffering from problems such as data sparsity\nand cold-start. Self-supervised learning, an emerging paradigm that extracts\ninformation from unlabeled data, provides insights into addressing these\nproblems. Specifically, contrastive self-supervised learning, due to its\nflexibility and promising performance, has attracted considerable interest and\nrecently become a dominant branch in self-supervised learning-based\nrecommendation methods. In this survey, we provide an up-to-date and\ncomprehensive review of current contrastive self-supervised learning-based\nrecommendation methods. Firstly, we propose a unified framework for these\nmethods. We then introduce a taxonomy based on the key components of the\nframework, including view generation strategy, contrastive task, and\ncontrastive objective. For each component, we provide detailed descriptions and\ndiscussions to guide the choice of the appropriate method. 
Finally, we outline\nopen issues and promising directions for future research.\n","authors":["Mengyuan Jing","Yanmin Zhu","Tianzi Zang","Ke Wang"],"pdf_url":"https://arxiv.org/pdf/2303.09902v2.pdf","comment":"Accepted by ACM Transactions on Information Systems (TOIS)"},{"id":"http://arxiv.org/abs/2305.08732v3","updated":"2023-10-11T10:51:12Z","published":"2023-05-15T15:47:09Z","title":"Knowledge Rumination for Pre-trained Language Models","summary":" Previous studies have revealed that vanilla pre-trained language models\n(PLMs) lack the capacity to handle knowledge-intensive NLP tasks alone; thus,\nseveral works have attempted to integrate external knowledge into PLMs.\nHowever, despite the promising outcome, we empirically observe that PLMs may\nhave already encoded rich knowledge in their pre-trained parameters but fail to\nfully utilize them when applying them to knowledge-intensive tasks. In this\npaper, we propose a new paradigm dubbed Knowledge Rumination to help the\npre-trained language model utilize that related latent knowledge without\nretrieving it from the external corpus. By simply adding a prompt like \"As far\nas I know\" to the PLMs, we try to review related latent knowledge and inject\nthem back into the model for knowledge consolidation. We apply the proposed\nknowledge rumination to various language models, including RoBERTa, DeBERTa,\nand GPT-3. Experimental results on six commonsense reasoning tasks and GLUE\nbenchmarks demonstrate the effectiveness of our proposed approach, which proves\nthat the knowledge stored in PLMs can be better exploited to enhance\nperformance. Code is available in\nhttps://github.com/zjunlp/knowledge-rumination.\n","authors":["Yunzhi Yao","Peng Wang","Shengyu Mao","Chuanqi Tan","Fei Huang","Huajun Chen","Ningyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2305.08732v3.pdf","comment":"EMNLP 2023"},{"id":"http://arxiv.org/abs/2310.07346v1","updated":"2023-10-11T09:53:43Z","published":"2023-10-11T09:53:43Z","title":"Preliminary Results of a Scientometric Analysis of the German\n Information Retrieval Community 2020-2023","summary":" The German Information Retrieval community is located in two different\nsub-fields: Information and computer science. There are no current studies that\ninvestigate these communities on a scientometric level. Available studies only\nfocus on the information scientific part of the community. We generated a data\nset of 401 recent IR-related publications extracted from six core IR\nconferences from a mainly computer scientific background. We analyze this data\nset at the institutional and researcher level. The data set is publicly\nreleased, and we also demonstrate a mapping use case.\n","authors":["Philipp Schaer","Svetlana Myshkina","Jüri Keller"],"pdf_url":"https://arxiv.org/pdf/2310.07346v1.pdf","comment":"Data available at https://github.com/irgroup/LWDA2023-IR-community"},{"id":"http://arxiv.org/abs/2206.12781v4","updated":"2023-10-11T09:03:18Z","published":"2022-06-26T03:59:41Z","title":"Efficiently Leveraging Multi-level User Intent for Session-based\n Recommendation via Atten-Mixer Network","summary":" Session-based recommendation (SBR) aims to predict the user's next action\nbased on short and dynamic sessions. Recently, there has been an increasing\ninterest in utilizing various elaborately designed graph neural networks (GNNs)\nto capture the pair-wise relationships among items, seemingly suggesting the\ndesign of more complicated models is the panacea for improving the empirical\nperformance. 
However, these models achieve relatively marginal improvements\nwith exponential growth in model complexity. In this paper, we dissect the\nclassical GNN-based SBR models and empirically find that some sophisticated GNN\npropagations are redundant, given the readout module plays a significant role\nin GNN-based models. Based on this observation, we intuitively propose to\nremove the GNN propagation part, while the readout module will take on more\nresponsibility in the model reasoning process. To this end, we propose the\nMulti-Level Attention Mixture Network (Atten-Mixer), which leverages both\nconcept-view and instance-view readouts to achieve multi-level reasoning over\nitem transitions. As simply enumerating all possible high-level concepts is\ninfeasible for large real-world recommender systems, we further incorporate\nSBR-related inductive biases, i.e., local invariance and inherent priority to\nprune the search space. Experiments on three benchmarks demonstrate the\neffectiveness and efficiency of our proposal. We also have already launched the\nproposed techniques to a large-scale e-commercial online service since April\n2021, with significant improvements of top-tier business metrics demonstrated\nin the online experiments on live traffic.\n","authors":["Peiyan Zhang","Jiayan Guo","Chaozhuo Li","Yueqi Xie","Jaeboum Kim","Yan Zhang","Xing Xie","Haohan Wang","Sunghun Kim"],"pdf_url":"https://arxiv.org/pdf/2206.12781v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.07678v2","updated":"2023-10-11T08:34:42Z","published":"2023-03-14T07:27:30Z","title":"Query2doc: Query Expansion with Large Language Models","summary":" This paper introduces a simple yet effective query expansion approach,\ndenoted as query2doc, to improve both sparse and dense retrieval systems. The\nproposed method first generates pseudo-documents by few-shot prompting large\nlanguage models (LLMs), and then expands the query with generated\npseudo-documents. LLMs are trained on web-scale text corpora and are adept at\nknowledge memorization. The pseudo-documents from LLMs often contain highly\nrelevant information that can aid in query disambiguation and guide the\nretrievers. Experimental results demonstrate that query2doc boosts the\nperformance of BM25 by 3% to 15% on ad-hoc IR datasets, such as MS-MARCO and\nTREC DL, without any model fine-tuning. Furthermore, our method also benefits\nstate-of-the-art dense retrievers in terms of both in-domain and out-of-domain\nresults.\n","authors":["Liang Wang","Nan Yang","Furu Wei"],"pdf_url":"https://arxiv.org/pdf/2303.07678v2.pdf","comment":"Accepted to EMNLP 2023"},{"id":"http://arxiv.org/abs/2310.07281v1","updated":"2023-10-11T08:15:10Z","published":"2023-10-11T08:15:10Z","title":"A Completely Locale-independent Session-based Recommender System by\n Leveraging Trained Model","summary":" In this paper, we propose a solution that won the 10th prize in the KDD Cup\n2023 Challenge Task 2 (Next Product Recommendation for Underrepresented\nLanguages/Locales). Our approach involves two steps: (i) Identify candidate\nitem sets based on co-visitation, and (ii) Re-ranking the items using LightGBM\nwith locale-independent features, including session-based features and product\nsimilarity. 
The experiment demonstrated that the locale-independent model\nperformed consistently well across different test locales, and performed even\nbetter when incorporating data from other locales into the training.\n","authors":["Yu Tokutake","Chihiro Yamasaki","Yongzhi Jin","Ayuka Inoue","Kei Harada"],"pdf_url":"https://arxiv.org/pdf/2310.07281v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.06282v2","updated":"2023-10-11T02:46:12Z","published":"2023-10-10T03:32:33Z","title":"MuseChat: A Conversational Music Recommendation System for Videos","summary":" We introduce MuseChat, an innovative dialog-based music recommendation\nsystem. This unique platform not only offers interactive user engagement but\nalso suggests music tailored for input videos, so that users can refine and\npersonalize their music selections. In contrast, previous systems predominantly\nemphasized content compatibility, often overlooking the nuances of users'\nindividual preferences. For example, all the datasets only provide basic\nmusic-video pairings or such pairings with textual music descriptions. To\naddress this gap, our research offers three contributions. First, we devise a\nconversation-synthesis method that simulates a two-turn interaction between a\nuser and a recommendation system, which leverages pre-trained music tags and\nartist information. In this interaction, users submit a video to the system,\nwhich then suggests a suitable music piece with a rationale. Afterwards, users\ncommunicate their musical preferences, and the system presents a refined music\nrecommendation with reasoning. Second, we introduce a multi-modal\nrecommendation engine that matches music either by aligning it with visual cues\nfrom the video or by harmonizing visual information, feedback from previously\nrecommended music, and the user's textual input. Third, we bridge music\nrepresentations and textual data with a Large Language Model(Vicuna-7B). This\nalignment equips MuseChat to deliver music recommendations and their underlying\nreasoning in a manner resembling human communication. Our evaluations show that\nMuseChat surpasses existing state-of-the-art models in music retrieval tasks\nand pioneers the integration of the recommendation process within a natural\nlanguage framework.\n","authors":["Zhikang Dong","Bin Chen","Xiulong Liu","Pawel Polak","Peng Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.06282v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07142v1","updated":"2023-10-11T02:36:38Z","published":"2023-10-11T02:36:38Z","title":"Validating Synthetic Usage Data in Living Lab Environments","summary":" Evaluating retrieval performance without editorial relevance judgments is\nchallenging, but instead, user interactions can be used as relevance signals.\nLiving labs offer a way for small-scale platforms to validate information\nretrieval systems with real users. If enough user interaction data are\navailable, click models can be parameterized from historical sessions to\nevaluate systems before exposing users to experimental rankings. However,\ninteraction data are sparse in living labs, and little is studied about how\nclick models can be validated for reliable user simulations when click data are\navailable in moderate amounts.\n This work introduces an evaluation approach for validating synthetic usage\ndata generated by click models in data-sparse human-in-the-loop environments\nlike living labs. 
We ground our methodology on the click model's estimates\nabout a system ranking compared to a reference ranking for which the relative\nperformance is known. Our experiments compare different click models and their\nreliability and robustness as more session log data becomes available. In our\nsetup, simple click models can reliably determine the relative system\nperformance with already 20 logged sessions for 50 queries. In contrast, more\ncomplex click models require more session data for reliable estimates, but they\nare a better choice in simulated interleaving experiments when enough session\ndata are available. While it is easier for click models to distinguish between\nmore diverse systems, it is harder to reproduce the system ranking based on the\nsame retrieval algorithm with different interpolation weights. Our setup is\nentirely open, and we share the code to reproduce the experiments.\n","authors":["Timo Breuer","Norbert Fuhr","Philipp Schaer"],"pdf_url":"https://arxiv.org/pdf/2310.07142v1.pdf","comment":"25 pages + appendix and references, accepted JDIQ journal paper"},{"id":"http://arxiv.org/abs/2310.07137v1","updated":"2023-10-11T02:22:28Z","published":"2023-10-11T02:22:28Z","title":"AE-smnsMLC: Multi-Label Classification with Semantic Matching and\n Negative Label Sampling for Product Attribute Value Extraction","summary":" Product attribute value extraction plays an important role for many\nreal-world applications in e-Commerce such as product search and\nrecommendation. Previous methods treat it as a sequence labeling task that\nneeds more annotation for position of values in the product text. This limits\ntheir application to real-world scenario in which only attribute values are\nweakly-annotated for each product without their position. Moreover, these\nmethods only use product text (i.e., product title and description) and do not\nconsider the semantic connection between the multiple attribute values of a\ngiven product and its text, which can help attribute value extraction. In this\npaper, we reformulate this task as a multi-label classification task that can\nbe applied for real-world scenario in which only annotation of attribute values\nis available to train models (i.e., annotation of positional information of\nattribute values is not available). We propose a classification model with\nsemantic matching and negative label sampling for attribute value extraction.\nSemantic matching aims to capture semantic interactions between attribute\nvalues of a given product and its text. Negative label sampling aims to enhance\nthe model's ability of distinguishing similar values belonging to the same\nattribute. Experimental results on three subsets of a large real-world\ne-Commerce dataset demonstrate the effectiveness and superiority of our\nproposed model.\n","authors":["Zhongfen Deng","Wei-Te Chen","Lei Chen","Philip S. Yu"],"pdf_url":"https://arxiv.org/pdf/2310.07137v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07874v1","updated":"2023-10-11T20:34:17Z","published":"2023-10-11T20:34:17Z","title":"Refined Mechanism Design for Approximately Structured Priors via Active\n Regression","summary":" We consider the problem of a revenue-maximizing seller with a large number of\nitems $m$ for sale to $n$ strategic bidders, whose valuations are drawn\nindependently from high-dimensional, unknown prior distributions. 
It is\nwell-known that optimal and even approximately-optimal mechanisms for this\nsetting are notoriously difficult to characterize or compute, and, even when\nthey can be found, are often rife with various counter-intuitive properties. In\nthis paper, following a model introduced recently by Cai and\nDaskalakis~\\cite{cai2022recommender}, we consider the case that bidders' prior\ndistributions can be well-approximated by a topic model. We design an active\nlearning component, responsible for interacting with the bidders and outputting\nlow-dimensional approximations of their types, and a mechanism design\ncomponent, responsible for robustifying mechanisms for the low-dimensional\nmodel to work for the approximate types of the former component. On the active\nlearning front, we cast our problem in the framework of Randomized Linear\nAlgebra (RLA) for regression problems, allowing us to import several\nbreakthrough results from that line of research, and adapt them to our setting.\nOn the mechanism design front, we remove many restrictive assumptions of prior\nwork on the type of access needed to the underlying distributions and the\nassociated mechanisms. To the best of our knowledge, our work is the first to\nformulate connections between mechanism design, and RLA for active learning of\nregression problems, opening the door for further applications of randomized\nlinear algebra primitives to mechanism design.\n","authors":["Christos Boutsikas","Petros Drineas","Marios Mertzanidis","Alexandros Psomas","Paritosh Verma"],"pdf_url":"https://arxiv.org/pdf/2310.07874v1.pdf","comment":"37th Conference on Neural Information Processing Systems (NeurIPS\n 2023)"},{"id":"http://arxiv.org/abs/2310.07815v1","updated":"2023-10-11T18:56:15Z","published":"2023-10-11T18:56:15Z","title":"Language Models As Semantic Indexers","summary":" Semantic identifier (ID) is an important concept in information retrieval\nthat aims to preserve the semantics of objects such as documents and items\ninside their IDs. Previous studies typically adopt a two-stage pipeline to\nlearn semantic IDs by first procuring embeddings using off-the-shelf text\nencoders and then deriving IDs based on the embeddings. However, each step\nintroduces potential information loss and there is usually an inherent mismatch\nbetween the distribution of embeddings within the latent space produced by text\nencoders and the anticipated distribution required for semantic indexing.\nNevertheless, it is non-trivial to design a method that can learn the\ndocument's semantic representations and its hierarchical structure\nsimultaneously, given that semantic IDs are discrete and sequentially\nstructured, and the semantic supervision is deficient. In this paper, we\nintroduce LMINDEXER, a self-supervised framework to learn semantic IDs with a\ngenerative language model. We tackle the challenge of sequential discrete ID by\nintroducing a semantic indexer capable of generating neural sequential discrete\nrepresentations with progressive training and contrastive learning. In response\nto the semantic supervision deficiency, we propose to train the model with a\nself-supervised document reconstruction objective. 
The learned semantic indexer\ncan facilitate various downstream tasks, such as recommendation and retrieval.\nWe conduct experiments on three tasks including recommendation, product search,\nand document retrieval on five datasets from various domains, where LMINDEXER\noutperforms competitive baselines significantly and consistently.\n","authors":["Bowen Jin","Hansi Zeng","Guoyin Wang","Xiusi Chen","Tianxin Wei","Ruirui Li","Zhengyang Wang","Zheng Li","Yang Li","Hanqing Lu","Suhang Wang","Jiawei Han","Xianfeng Tang"],"pdf_url":"https://arxiv.org/pdf/2310.07815v1.pdf","comment":"9 pages, 3 appendix pages"},{"id":"http://arxiv.org/abs/2310.07786v1","updated":"2023-10-11T18:15:55Z","published":"2023-10-11T18:15:55Z","title":"Non-Stationary Contextual Bandit Learning via Neural Predictive Ensemble\n Sampling","summary":" Real-world applications of contextual bandits often exhibit non-stationarity\ndue to seasonality, serendipity, and evolving social trends. While a number of\nnon-stationary contextual bandit learning algorithms have been proposed in the\nliterature, they excessively explore due to a lack of prioritization for\ninformation of enduring value, or are designed in ways that do not scale in\nmodern applications with high-dimensional user-specific features and large\naction set, or both. In this paper, we introduce a novel non-stationary\ncontextual bandit algorithm that addresses these concerns. It combines a\nscalable, deep-neural-network-based architecture with a carefully designed\nexploration mechanism that strategically prioritizes collecting information\nwith the most lasting value in a non-stationary environment. Through empirical\nevaluations on two real-world recommendation datasets, which exhibit pronounced\nnon-stationarity, we demonstrate that our approach significantly outperforms\nthe state-of-the-art baselines.\n","authors":["Zheqing Zhu","Yueyang Liu","Xu Kuang","Benjamin Van Roy"],"pdf_url":"https://arxiv.org/pdf/2310.07786v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2305.05658v2","updated":"2023-10-11T17:59:44Z","published":"2023-05-09T17:52:59Z","title":"TidyBot: Personalized Robot Assistance with Large Language Models","summary":" For a robot to personalize physical assistance effectively, it must learn\nuser preferences that can be generally reapplied to future scenarios. In this\nwork, we investigate personalization of household cleanup with robots that can\ntidy up rooms by picking up objects and putting them away. A key challenge is\ndetermining the proper place to put each object, as people's preferences can\nvary greatly depending on personal taste or cultural background. For instance,\none person may prefer storing shirts in the drawer, while another may prefer\nthem on the shelf. We aim to build systems that can learn such preferences from\njust a handful of examples via prior interactions with a particular person. We\nshow that robots can combine language-based planning and perception with the\nfew-shot summarization capabilities of large language models (LLMs) to infer\ngeneralized user preferences that are broadly applicable to future\ninteractions. This approach enables fast adaptation and achieves 91.2% accuracy\non unseen objects in our benchmark dataset. 
We also demonstrate our approach on\na real-world mobile manipulator called TidyBot, which successfully puts away\n85.0% of objects in real-world test scenarios.\n","authors":["Jimmy Wu","Rika Antonova","Adam Kan","Marion Lepert","Andy Zeng","Shuran Song","Jeannette Bohg","Szymon Rusinkiewicz","Thomas Funkhouser"],"pdf_url":"https://arxiv.org/pdf/2305.05658v2.pdf","comment":"Accepted to Autonomous Robots (AuRo) - Special Issue: Large Language\n Models in Robotics, 2023 and IEEE/RSJ International Conference on Intelligent\n Robots and Systems (IROS), 2023. Project page:\n https://tidybot.cs.princeton.edu"},{"id":"http://arxiv.org/abs/2310.07713v1","updated":"2023-10-11T17:59:05Z","published":"2023-10-11T17:59:05Z","title":"InstructRetro: Instruction Tuning post Retrieval-Augmented Pretraining","summary":" Pretraining auto-regressive large language models (LLMs) with retrieval\ndemonstrates better perplexity and factual accuracy by leveraging external\ndatabases. However, the size of existing pretrained retrieval-augmented LLM is\nstill limited (e.g., Retro has 7.5B parameters), which limits the effectiveness\nof instruction tuning and zero-shot generalization. In this work, we introduce\nRetro 48B, the largest LLM pretrained with retrieval before instruction tuning.\nSpecifically, we continue to pretrain the 43B GPT model on additional 100\nbillion tokens using the Retro augmentation method by retrieving from 1.2\ntrillion tokens. The obtained foundation model, Retro 48B, largely outperforms\nthe original 43B GPT in terms of perplexity. After instruction tuning on Retro,\nInstructRetro demonstrates significant improvement over the instruction tuned\nGPT on zero-shot question answering (QA) tasks. Specifically, the average\nimprovement of InstructRetro is 7% over its GPT counterpart across 8 short-form\nQA tasks, and 10% over GPT across 4 challenging long-form QA tasks.\nSurprisingly, we find that one can ablate the encoder from InstructRetro\narchitecture and directly use its decoder backbone, while achieving comparable\nresults. We hypothesize that pretraining with retrieval makes its decoder good\nat incorporating context for QA. Our results highlights the promising direction\nto obtain a better GPT decoder for QA through continued pretraining with\nretrieval before instruction tuning.\n","authors":["Boxin Wang","Wei Ping","Lawrence McAfee","Peng Xu","Bo Li","Mohammad Shoeybi","Bryan Catanzaro"],"pdf_url":"https://arxiv.org/pdf/2310.07713v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07712v1","updated":"2023-10-11T17:59:02Z","published":"2023-10-11T17:59:02Z","title":"Found in the Middle: Permutation Self-Consistency Improves Listwise\n Ranking in Large Language Models","summary":" Large language models (LLMs) exhibit positional bias in how they use context,\nwhich especially complicates listwise ranking. To address this, we propose\npermutation self-consistency, a form of self-consistency over ranking list\noutputs of black-box LLMs. Our key idea is to marginalize out different list\norders in the prompt to produce an order-independent ranking with less\npositional bias. First, given some input prompt, we repeatedly shuffle the list\nin the prompt and pass it through the LLM while holding the instructions the\nsame. Next, we aggregate the resulting sample of rankings by computing the\ncentral ranking closest in distance to all of them, marginalizing out prompt\norder biases in the process. 
Theoretically, we prove the robustness of our\nmethod, showing convergence to the true ranking in the presence of random\nperturbations. Empirically, on five list-ranking datasets in sorting and\npassage reranking, our approach improves scores from conventional inference by\nup to 7-18% for GPT-3.5 and 8-16% for LLaMA v2 (70B), surpassing the previous\nstate of the art in passage reranking. Our code is at\nhttps://github.com/castorini/perm-sc.\n","authors":["Raphael Tang","Xinyu Zhang","Xueguang Ma","Jimmy Lin","Ferhan Ture"],"pdf_url":"https://arxiv.org/pdf/2310.07712v1.pdf","comment":"First two authors contributed equally; 10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2310.07711v1","updated":"2023-10-11T17:58:25Z","published":"2023-10-11T17:58:25Z","title":"Growing Brains: Co-emergence of Anatomical and Functional Modularity in\n Recurrent Neural Networks","summary":" Recurrent neural networks (RNNs) trained on compositional tasks can exhibit\nfunctional modularity, in which neurons can be clustered by activity similarity\nand participation in shared computational subtasks. Unlike brains, these RNNs\ndo not exhibit anatomical modularity, in which functional clustering is\ncorrelated with strong recurrent coupling and spatial localization of\nfunctional clusters. Contrasting with functional modularity, which can be\nephemerally dependent on the input, anatomically modular networks form a robust\nsubstrate for solving the same subtasks in the future. To examine whether it is\npossible to grow brain-like anatomical modularity, we apply a recent machine\nlearning method, brain-inspired modular training (BIMT), to a network being\ntrained to solve a set of compositional cognitive tasks. We find that\nfunctional and anatomical clustering emerge together, such that functionally\nsimilar neurons also become spatially localized and interconnected. Moreover,\ncompared to standard $L_1$ or no regularization settings, the model exhibits\nsuperior performance by optimally balancing task performance and network\nsparsity. In addition to achieving brain-like organization in RNNs, our\nfindings also suggest that BIMT holds promise for applications in neuromorphic\ncomputing and enhancing the interpretability of neural network architectures.\n","authors":["Ziming Liu","Mikail Khona","Ila R. Fiete","Max Tegmark"],"pdf_url":"https://arxiv.org/pdf/2310.07711v1.pdf","comment":"8 pages, 6 figures"},{"id":"http://arxiv.org/abs/2310.07710v1","updated":"2023-10-11T17:57:35Z","published":"2023-10-11T17:57:35Z","title":"DiPmark: A Stealthy, Efficient and Resilient Watermark for Large\n Language Models","summary":" Watermarking techniques offer a promising way to secure data via embedding\ncovert information into the data. A paramount challenge in the domain lies in\npreserving the distribution of original data during watermarking. Our research\nextends and refines existing watermarking framework, placing emphasis on the\nimportance of a distribution-preserving (DiP) watermark. Contrary to the\ncurrent strategies, our proposed DiPmark preserves the original token\ndistribution during watermarking (stealthy), is detectable without access to\nthe language model API or weights (efficient), and is robust to moderate\nchanges of tokens (resilient). This is achieved by incorporating a novel\nreweight strategy, combined with a hash function that assigns unique\n\\textit{i.i.d.} ciphers based on the context. 
The empirical benchmarks of our\napproach underscore its stealthiness, efficiency, and resilience, making it a\nrobust solution for watermarking tasks that demand impeccable quality\npreservation.\n","authors":["Yihan Wu","Zhengmian Hu","Hongyang Zhang","Heng Huang"],"pdf_url":"https://arxiv.org/pdf/2310.07710v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07707v1","updated":"2023-10-11T17:57:14Z","published":"2023-10-11T17:57:14Z","title":"MatFormer: Nested Transformer for Elastic Inference","summary":" Transformer models are deployed in a wide range of settings, from\nmulti-accelerator clusters to standalone mobile phones. The diverse inference\nconstraints in these scenarios necessitate practitioners to train foundation\nmodels such as PaLM 2, Llama, & ViTs as a series of models of varying sizes.\nDue to significant training costs, only a select few model sizes are trained\nand supported, limiting more fine-grained control over relevant tradeoffs,\nincluding latency, cost, and accuracy. This work introduces MatFormer, a nested\nTransformer architecture designed to offer elasticity in a variety of\ndeployment constraints. Each Feed Forward Network (FFN) block of a MatFormer\nmodel is jointly optimized with a few nested smaller FFN blocks. This training\nprocedure allows for the Mix'n'Match of model granularities across layers --\ni.e., a trained universal MatFormer model enables extraction of hundreds of\naccurate smaller models, which were never explicitly optimized. We empirically\ndemonstrate MatFormer's effectiveness across different model classes (decoders\n& encoders), modalities (language & vision), and scales (up to 2.6B\nparameters). We find that a 2.6B decoder-only MatFormer language model (MatLM)\nallows us to extract smaller models spanning from 1.5B to 2.6B, each exhibiting\ncomparable validation loss and one-shot downstream evaluations to their\nindependently trained counterparts. Furthermore, we observe that smaller\nencoders extracted from a universal MatFormer-based ViT (MatViT) encoder\npreserve the metric-space structure for adaptive large-scale retrieval.\nFinally, we showcase that speculative decoding with the accurate and consistent\nsubmodels extracted from MatFormer can further reduce inference latency.\n","authors":[" Devvrit","Sneha Kudugunta","Aditya Kusupati","Tim Dettmers","Kaifeng Chen","Inderjit Dhillon","Yulia Tsvetkov","Hannaneh Hajishirzi","Sham Kakade","Ali Farhadi","Prateek Jain"],"pdf_url":"https://arxiv.org/pdf/2310.07707v1.pdf","comment":"31 pages, 12 figures, first three authors contributed equally"},{"id":"http://arxiv.org/abs/2310.07699v1","updated":"2023-10-11T17:49:13Z","published":"2023-10-11T17:49:13Z","title":"From Scarcity to Efficiency: Improving CLIP Training via Visual-enriched\n Captions","summary":" Web-crawled datasets are pivotal to the success of pre-training\nvision-language models, exemplified by CLIP. However, web-crawled AltTexts can\nbe noisy and potentially irrelevant to images, thereby undermining the crucial\nimage-text alignment. Existing methods for rewriting captions using large\nlanguage models (LLMs) have shown promise on small, curated datasets like CC3M\nand CC12M. Nevertheless, their efficacy on massive web-captured captions is\nconstrained by the inherent noise and randomness in such data. In this study,\nwe address this limitation by focusing on two key aspects: data quality and\ndata variety. 
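The nested feed-forward idea in the MatFormer abstract above lends itself to a short sketch. The NumPy code below is a toy illustration under assumed dimensions, not the released implementation: each smaller granularity reuses a prefix slice of the full FFN weights, which is what makes post-hoc Mix'n'Match extraction of submodels possible.

import numpy as np

d_model, d_ff = 16, 64
rng = np.random.default_rng(0)
W_in = rng.normal(size=(d_model, d_ff)) / np.sqrt(d_model)   # full FFN up-projection
W_out = rng.normal(size=(d_ff, d_model)) / np.sqrt(d_ff)     # full FFN down-projection

def nested_ffn(x: np.ndarray, width: int) -> np.ndarray:
    """Run the FFN using only the first `width` hidden units.

    Every smaller granularity is a prefix of the full block, so one set of
    trained weights serves many extraction sizes.
    """
    h = np.maximum(x @ W_in[:, :width], 0.0)   # ReLU over the nested slice
    return h @ W_out[:width, :]

x = rng.normal(size=(2, d_model))
for width in (16, 32, 64):   # hypothetical granularities
    print(width, nested_ffn(x, width).shape)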
Unlike recent LLM rewriting techniques, we emphasize exploiting\nvisual concepts and their integration into the captions to improve data\nquality. For data variety, we propose a novel mixed training scheme that\noptimally leverages AltTexts alongside newly generated Visual-enriched Captions\n(VeC). We use CLIP as one example and adapt the method for CLIP training on\nlarge-scale web-crawled datasets, named VeCLIP. We conduct a comprehensive\nevaluation of VeCLIP across small, medium, and large scales of raw data. Our\nresults show significant advantages in image-text alignment and overall model\nperformance, underscoring the effectiveness of VeCLIP in improving CLIP\ntraining. For example, VeCLIP achieves a remarkable over 20% improvement in\nCOCO and Flickr30k retrieval tasks under the 12M setting. For data efficiency,\nwe also achieve a notable over 3% improvement while using only 14% of the data\nemployed in the vanilla CLIP and 11% in ALIGN.\n","authors":["Zhengfeng Lai","Haotian Zhang","Wentao Wu","Haoping Bai","Aleksei Timofeev","Xianzhi Du","Zhe Gan","Jiulong Shan","Chen-Nee Chuah","Yinfei Yang","Meng Cao"],"pdf_url":"https://arxiv.org/pdf/2310.07699v1.pdf","comment":"CV/ML"},{"id":"http://arxiv.org/abs/2310.07698v1","updated":"2023-10-11T17:46:59Z","published":"2023-10-11T17:46:59Z","title":"SurroCBM: Concept Bottleneck Surrogate Models for Generative Post-hoc\n Explanation","summary":" Explainable AI seeks to bring light to the decision-making processes of\nblack-box models. Traditional saliency-based methods, while highlighting\ninfluential data segments, often lack semantic understanding. Recent\nadvancements, such as Concept Activation Vectors (CAVs) and Concept Bottleneck\nModels (CBMs), offer concept-based explanations but necessitate human-defined\nconcepts. However, human-annotated concepts are expensive to attain. This paper\nintroduces the Concept Bottleneck Surrogate Models (SurroCBM), a novel\nframework that aims to explain the black-box models with automatically\ndiscovered concepts. SurroCBM identifies shared and unique concepts across\nvarious black-box models and employs an explainable surrogate model for\npost-hoc explanations. An effective training strategy using self-generated data\nis proposed to enhance explanation quality continuously. Through extensive\nexperiments, we demonstrate the efficacy of SurroCBM in concept discovery and\nexplanation, underscoring its potential in advancing the field of explainable\nAI.\n","authors":["Bo Pan","Zhenke Liu","Yifei Zhang","Liang Zhao"],"pdf_url":"https://arxiv.org/pdf/2310.07698v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2203.11196v2","updated":"2023-10-11T17:38:26Z","published":"2022-03-18T20:23:41Z","title":"Performance of Deep Learning models with transfer learning for\n multiple-step-ahead forecasts in monthly time series","summary":" Deep Learning and transfer learning models are being used to generate time\nseries forecasts; however, there is scarce evidence about their performance\nprediction that it is more evident for monthly time series. The purpose of this\npaper is to compare Deep Learning models with transfer learning and without\ntransfer learning and other traditional methods used for monthly forecasts to\nanswer three questions about the suitability of Deep Learning and Transfer\nLearning to generate predictions of time series. Time series of M4 and M3\ncompetitions were used for the experiments. 
The results suggest that deep\nlearning models based on TCN, LSTM, and CNN with transfer learning tend to\nsurpass the performance prediction of other traditional methods. On the other\nhand, TCN and LSTM, trained directly on the target time series, got similar or\nbetter performance than traditional methods for some forecast horizons.\n","authors":["Martín Solís","Luis-Alexander Calvo-Valverde"],"pdf_url":"https://arxiv.org/pdf/2203.11196v2.pdf","comment":"20 pages, 7 figures, 5 tables"},{"id":"http://arxiv.org/abs/2310.07683v1","updated":"2023-10-11T17:34:56Z","published":"2023-10-11T17:34:56Z","title":"Controllable Data Generation Via Iterative Data-Property Mutual Mappings","summary":" Deep generative models have been widely used for their ability to generate\nrealistic data samples in various areas, such as images, molecules, text, and\nspeech. One major goal of data generation is controllability, namely to\ngenerate new data with desired properties. Despite growing interest in the area\nof controllable generation, significant challenges still remain, including 1)\ndisentangling desired properties with unrelated latent variables, 2)\nout-of-distribution property control, and 3) objective optimization for\nout-of-distribution property control. To address these challenges, in this\npaper, we propose a general framework to enhance VAE-based data generators with\nproperty controllability and ensure disentanglement. Our proposed objective can\nbe optimized on both data seen and unseen in the training set. We propose a\ntraining procedure to train the objective in a semi-supervised manner by\niteratively conducting mutual mappings between the data and properties. The\nproposed framework is implemented on four VAE-based controllable generators to\nevaluate its performance on property error, disentanglement, generation\nquality, and training time. The results indicate that our proposed framework\nenables more precise control over the properties of generated samples in a\nshort training time, ensuring the disentanglement and keeping the validity of\nthe generated samples.\n","authors":["Bo Pan","Muran Qin","Shiyu Wang","Yifei Zhang","Liang Zhao"],"pdf_url":"https://arxiv.org/pdf/2310.07683v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07678v1","updated":"2023-10-11T17:21:48Z","published":"2023-10-11T17:21:48Z","title":"Explainable Image Similarity: Integrating Siamese Networks and Grad-CAM","summary":" With the proliferation of image-based applications in various domains, the\nneed for accurate and interpretable image similarity measures has become\nincreasingly critical. Existing image similarity models often lack\ntransparency, making it challenging to understand the reasons why two images\nare considered similar. In this paper, we propose the concept of explainable\nimage similarity, where the goal is the development of an approach, which is\ncapable of providing similarity scores along with visual factual and\ncounterfactual explanations. Along this line, we present a new framework, which\nintegrates Siamese Networks and Grad-CAM for providing explainable image\nsimilarity and discuss the potential benefits and challenges of adopting this\napproach. In addition, we provide a comprehensive discussion about factual and\ncounterfactual explanations provided by the proposed framework for assisting\ndecision making. 
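To make the Siamese half of the explainable image similarity framework above concrete, here is a minimal PyTorch sketch: one shared encoder embeds both images and cosine similarity serves as the score. The tiny untrained encoder is purely hypothetical, and the Grad-CAM step that would produce the factual and counterfactual visual explanations is not shown.

import torch
import torch.nn as nn
import torch.nn.functional as F

class TinyEncoder(nn.Module):
    """Toy shared encoder standing in for the Siamese branch backbone."""
    def __init__(self, embed_dim: int = 32):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(3, 8, 3, stride=2, padding=1), nn.ReLU(),
            nn.Conv2d(8, 16, 3, stride=2, padding=1), nn.ReLU(),
            nn.AdaptiveAvgPool2d(1),
        )
        self.fc = nn.Linear(16, embed_dim)

    def forward(self, x):
        return self.fc(self.conv(x).flatten(1))

def similarity_score(encoder: nn.Module, img_a: torch.Tensor, img_b: torch.Tensor) -> torch.Tensor:
    """Cosine similarity between the two embeddings (same weights for both branches)."""
    za, zb = encoder(img_a), encoder(img_b)
    return F.cosine_similarity(za, zb)

encoder = TinyEncoder()
a, b = torch.randn(1, 3, 64, 64), torch.randn(1, 3, 64, 64)
print(similarity_score(encoder, a, b).item())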
The proposed approach has the potential to enhance the\ninterpretability, trustworthiness and user acceptance of image-based systems in\nreal-world image similarity applications. The implementation code can be found\nin https://github.com/ioannislivieris/Grad_CAM_Siamese.git.\n","authors":["Ioannis E. Livieris","Emmanuel Pintelas","Niki Kiriakidou","Panagiotis Pintelas"],"pdf_url":"https://arxiv.org/pdf/2310.07678v1.pdf","comment":"The manuscript has been submitted for publication in \"Journal of\n Imaging\""},{"id":"http://arxiv.org/abs/2310.07676v1","updated":"2023-10-11T17:21:03Z","published":"2023-10-11T17:21:03Z","title":"Composite Backdoor Attacks Against Large Language Models","summary":" Large language models (LLMs) have demonstrated superior performance compared\nto previous methods on various tasks, and often serve as the foundation models\nfor many researches and services. However, the untrustworthy third-party LLMs\nmay covertly introduce vulnerabilities for downstream tasks. In this paper, we\nexplore the vulnerability of LLMs through the lens of backdoor attacks.\nDifferent from existing backdoor attacks against LLMs, ours scatters multiple\ntrigger keys in different prompt components. Such a Composite Backdoor Attack\n(CBA) is shown to be stealthier than implanting the same multiple trigger keys\nin only a single component. CBA ensures that the backdoor is activated only\nwhen all trigger keys appear. Our experiments demonstrate that CBA is effective\nin both natural language processing (NLP) and multimodal tasks. For instance,\nwith $3\\%$ poisoning samples against the LLaMA-7B model on the Emotion dataset,\nour attack achieves a $100\\%$ Attack Success Rate (ASR) with a False Triggered\nRate (FTR) below $2.06\\%$ and negligible model accuracy degradation. The unique\ncharacteristics of our CBA can be tailored for various practical scenarios,\ne.g., targeting specific user groups. Our work highlights the necessity of\nincreased security research on the trustworthiness of foundation LLMs.\n","authors":["Hai Huang","Zhengyu Zhao","Michael Backes","Yun Shen","Yang Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.07676v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07672v1","updated":"2023-10-11T17:18:51Z","published":"2023-10-11T17:18:51Z","title":"Stabilizing Estimates of Shapley Values with Control Variates","summary":" Shapley values are among the most popular tools for explaining predictions of\nblackbox machine learning models. However, their high computational cost\nmotivates the use of sampling approximations, inducing a considerable degree of\nuncertainty. To stabilize these model explanations, we propose ControlSHAP, an\napproach based on the Monte Carlo technique of control variates. Our\nmethodology is applicable to any machine learning model and requires virtually\nno extra computation or modeling effort. On several high-dimensional datasets,\nwe find it can produce dramatic reductions in the Monte Carlo variability of\nShapley estimates.\n","authors":["Jeremy Goldwasser","Giles Hooker"],"pdf_url":"https://arxiv.org/pdf/2310.07672v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07668v1","updated":"2023-10-11T17:17:40Z","published":"2023-10-11T17:17:40Z","title":"GRaMuFeN: Graph-based Multi-modal Fake News Detection in Social Media","summary":" The proliferation of social media platforms such as Twitter, Instagram, and\nWeibo has significantly enhanced the dissemination of false information. 
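The ControlSHAP abstract above rests on the Monte Carlo technique of control variates; the snippet below demonstrates only that underlying trick on a toy expectation, since the abstract does not detail how the control variate for Shapley values is constructed. The target function, distribution, and sample sizes are invented.

import numpy as np

rng = np.random.default_rng(0)

def estimate_with_control_variate(y: np.ndarray, z: np.ndarray, z_mean: float) -> float:
    """Control-variate estimator of E[y] given samples z with known mean E[z] = z_mean."""
    c = np.cov(y, z)[0, 1] / np.var(z)          # (near-)optimal coefficient
    return float(np.mean(y - c * (z - z_mean)))

# Toy target: E[X**2] for X ~ N(1, 1); the control variate is X itself,
# whose mean is known exactly, so variability in the estimate shrinks.
n, trials = 200, 2000
plain, cv = [], []
for _ in range(trials):
    x = rng.normal(loc=1.0, scale=1.0, size=n)
    y = x**2
    plain.append(np.mean(y))
    cv.append(estimate_with_control_variate(y, x, z_mean=1.0))

print("plain MC variance       :", np.var(plain))
print("control-variate variance:", np.var(cv))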
This\nphenomenon grants both individuals and governmental entities the ability to\nshape public opinions, highlighting the need for deploying effective detection\nmethods. In this paper, we propose GraMuFeN, a model designed to detect fake\ncontent by analyzing both the textual and image content of news. GraMuFeN\ncomprises two primary components: a text encoder and an image encoder. For\ntextual analysis, GraMuFeN treats each text as a graph and employs a Graph\nConvolutional Neural Network (GCN) as the text encoder. Additionally, the\npre-trained ResNet-152, as a Convolutional Neural Network (CNN), has been\nutilized as the image encoder. By integrating the outputs from these two\nencoders and implementing a contrastive similarity loss function, GraMuFeN\nachieves remarkable results. Extensive evaluations conducted on two publicly\navailable benchmark datasets for social media news indicate a 10 % increase in\nmicro F1-Score, signifying improvement over existing state-of-the-art models.\nThese findings underscore the effectiveness of combining GCN and CNN models for\ndetecting fake news in multi-modal data, all while minimizing the additional\ncomputational burden imposed by model parameters.\n","authors":["Makan Kananian","Fatima Badiei","S. AmirAli Gh. Ghahramani"],"pdf_url":"https://arxiv.org/pdf/2310.07668v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07667v1","updated":"2023-10-11T17:16:33Z","published":"2023-10-11T17:16:33Z","title":"Global Minima, Recoverability Thresholds, and Higher-Order Structure in\n GNNS","summary":" We analyze the performance of graph neural network (GNN) architectures from\nthe perspective of random graph theory. Our approach promises to complement\nexisting lenses on GNN analysis, such as combinatorial expressive power and\nworst-case adversarial analysis, by connecting the performance of GNNs to\ntypical-case properties of the training data. First, we theoretically\ncharacterize the nodewise accuracy of one- and two-layer GCNs relative to the\ncontextual stochastic block model (cSBM) and related models. We additionally\nprove that GCNs cannot beat linear models under certain circumstances. Second,\nwe numerically map the recoverability thresholds, in terms of accuracy, of four\ndiverse GNN architectures (GCN, GAT, SAGE, and Graph Transformer) under a\nvariety of assumptions about the data. Sample results of this second analysis\ninclude: heavy-tailed degree distributions enhance GNN performance, GNNs can\nwork well on strongly heterophilous graphs, and SAGE and Graph Transformer can\nperform well on arbitrarily noisy edge data, but no architecture handled\nsufficiently noisy feature data well. Finally, we show how both specific\nhigher-order structures in synthetic data and the mix of empirical structures\nin real data have dramatic effects (usually negative) on GNN performance.\n","authors":["Drake Brown","Trevor Garrity","Kaden Parker","Jason Oliphant","Stone Carson","Cole Hanson","Zachary Boyd"],"pdf_url":"https://arxiv.org/pdf/2310.07667v1.pdf","comment":"28 pages"},{"id":"http://arxiv.org/abs/1910.08883v4","updated":"2023-10-11T17:14:41Z","published":"2019-10-20T03:14:20Z","title":"High-dimensional and universally consistent k-sample tests","summary":" The k-sample testing problem involves determining whether $k$ groups of data\npoints are each drawn from the same distribution. 
The standard method for\nk-sample testing in biomedicine is Multivariate analysis of variance (MANOVA),\ndespite that it depends on strong, and often unsuitable, parametric\nassumptions. Moreover, independence testing and k-sample testing are closely\nrelated, and several universally consistent high-dimensional independence tests\nsuch as distance correlation (Dcorr) and Hilbert-Schmidt-Independence-Criterion\n(Hsic) enjoy solid theoretical and empirical properties. In this paper, we\nprove that independence tests achieve universally consistent k-sample testing\nand that k-sample statistics such as Energy and Maximum Mean Discrepancy (MMD)\nare precisely equivalent to Dcorr. An empirical evaluation of nonparametric\nindependence tests showed that they generally perform better than the popular\nMANOVA test, even in Gaussian distributed scenarios. The evaluation included\nseveral popular independence statistics and covered a comprehensive set of\nsimulations. Additionally, the testing approach was extended to perform\nmultiway and multilevel tests, which were demonstrated in a simulated study as\nwell as real-world fMRI brain scans with a set of attributes.\n","authors":["Sambit Panda","Cencheng Shen","Ronan Perry","Jelle Zorn","Antoine Lutz","Carey E. Priebe","Joshua T. Vogelstein"],"pdf_url":"https://arxiv.org/pdf/1910.08883v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07665v1","updated":"2023-10-11T17:11:10Z","published":"2023-10-11T17:11:10Z","title":"Deep Backtracking Counterfactuals for Causally Compliant Explanations","summary":" Counterfactuals can offer valuable insights by answering what would have been\nobserved under altered circumstances, conditional on a factual observation.\nWhereas the classical interventional interpretation of counterfactuals has been\nstudied extensively, backtracking constitutes a less studied alternative; the\nbacktracking principle has emerged as an alternative philosophy where all\ncausal laws are kept intact. In the present work, we introduce a practical\nmethod for computing backtracking counterfactuals in structural causal models\nthat consist of deep generative components. To this end, we impose conditions\non the structural assignments that enable the generation of counterfactuals by\nsolving a tractable constrained optimization problem in the structured latent\nspace of a causal model. Our formulation also facilitates a comparison with\nmethods in the field of counterfactual explanations. Compared to these, our\nmethod represents a versatile, modular and causally compliant alternative. We\ndemonstrate these properties experimentally on a modified version of MNIST and\nCelebA.\n","authors":["Klaus-Rudolf Kladny","Julius von Kügelgen","Bernhard Schölkopf","Michael Muehlebach"],"pdf_url":"https://arxiv.org/pdf/2310.07665v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.06577v2","updated":"2023-10-11T17:10:13Z","published":"2023-01-16T19:27:16Z","title":"Learning from Very Little Data: On the Value of Landscape Analysis for\n Predicting Software Project Health","summary":" When data is scarce, software analytics can make many mistakes. For example,\nconsider learning predictors for open source project health (e.g. the number of\nclosed pull requests in twelve months time). The training data for this task\nmay be very small (e.g. five years of data, collected every month means just 60\nrows of training data). 
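Since the k-sample testing abstract above ties statistics such as Energy and MMD to independence tests, a compact permutation test built on the two-sample energy distance may help fix ideas. This is a generic textbook construction with invented toy data, not the paper's multiway or multilevel procedures.

import numpy as np

def energy_distance(x: np.ndarray, y: np.ndarray) -> float:
    """Two-sample energy distance: 2 E||X-Y|| - E||X-X'|| - E||Y-Y'||."""
    def mean_pairwise(a, b):
        return np.mean(np.linalg.norm(a[:, None, :] - b[None, :, :], axis=-1))
    return 2 * mean_pairwise(x, y) - mean_pairwise(x, x) - mean_pairwise(y, y)

def permutation_test(x, y, n_perm: int = 500, seed: int = 0) -> float:
    """P-value for H0: both groups are drawn from the same distribution."""
    rng = np.random.default_rng(seed)
    observed = energy_distance(x, y)
    pooled = np.vstack([x, y])
    count = 0
    for _ in range(n_perm):
        perm = rng.permutation(len(pooled))
        px, py = pooled[perm[:len(x)]], pooled[perm[len(x):]]
        count += energy_distance(px, py) >= observed
    return (count + 1) / (n_perm + 1)

rng = np.random.default_rng(1)
x = rng.normal(0.0, 1.0, size=(60, 3))
y = rng.normal(0.7, 1.0, size=(60, 3))   # shifted mean, so H0 should be rejected
print("p-value:", permutation_test(x, y))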
The models generated from such tiny data sets can make\nmany prediction errors.\n Those errors can be tamed by a {\\em landscape analysis} that selects better\nlearner control parameters. Our niSNEAK tool (a)~clusters the data to find the\ngeneral landscape of the hyperparameters; then (b)~explores a few\nrepresentatives from each part of that landscape. niSNEAK is both faster and\nmore effective than prior state-of-the-art hyperparameter optimization\nalgorithms (e.g. FLASH, HYPEROPT, OPTUNA).\n The configurations found by niSNEAK have far less error than other methods.\nFor example, for project health indicators such as $C$= number of commits;\n$I$=number of closed issues, and $R$=number of closed pull requests, niSNEAK's\n12 month prediction errors are \\{I=0\\%, R=33\\%\\,C=47\\%\\}\n Based on the above, we recommend landscape analytics (e.g. niSNEAK)\nespecially when learning from very small data sets. This paper only explores\nthe application of niSNEAK to project health. That said, we see nothing in\nprinciple that prevents the application of this technique to a wider range of\nproblems.\n To assist other researchers in repeating, improving, or even refuting our\nresults, all our scripts and data are available on GitHub at\nhttps://github.com/zxcv123456qwe/niSneak\n","authors":["Andre Lustosa","Tim Menzies"],"pdf_url":"https://arxiv.org/pdf/2301.06577v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.08703v3","updated":"2023-10-11T17:00:34Z","published":"2023-05-15T15:06:20Z","title":"Schema-adaptable Knowledge Graph Construction","summary":" Conventional Knowledge Graph Construction (KGC) approaches typically follow\nthe static information extraction paradigm with a closed set of pre-defined\nschema. As a result, such approaches fall short when applied to dynamic\nscenarios or domains, whereas a new type of knowledge emerges. This\nnecessitates a system that can handle evolving schema automatically to extract\ninformation for KGC. To address this need, we propose a new task called\nschema-adaptable KGC, which aims to continually extract entity, relation, and\nevent based on a dynamically changing schema graph without re-training. We\nfirst split and convert existing datasets based on three principles to build a\nbenchmark, i.e., horizontal schema expansion, vertical schema expansion, and\nhybrid schema expansion; then investigate the schema-adaptable performance of\nseveral well-known approaches such as Text2Event, TANL, UIE and GPT-3.5. We\nfurther propose a simple yet effective baseline dubbed \\textsc{AdaKGC}, which\ncontains schema-enriched prefix instructor and schema-conditioned dynamic\ndecoding to better handle evolving schema. Comprehensive experimental results\nillustrate that AdaKGC can outperform baselines but still have room for\nimprovement. We hope the proposed work can deliver benefits to the community.\nCode and datasets available at https://github.com/zjunlp/AdaKGC.\n","authors":["Hongbin Ye","Honghao Gui","Xin Xu","Huajun Chen","Ningyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2305.08703v3.pdf","comment":"EMNLP 2023 (Findings)"},{"id":"http://arxiv.org/abs/2310.07658v1","updated":"2023-10-11T17:00:03Z","published":"2023-10-11T17:00:03Z","title":"The First Pathloss Radio Map Prediction Challenge","summary":" To foster research and facilitate fair comparisons among recently proposed\npathloss radio map prediction methods, we have launched the ICASSP 2023 First\nPathloss Radio Map Prediction Challenge. 
In this short overview paper, we\nbriefly describe the pathloss prediction problem, the provided datasets, the\nchallenge task and the challenge evaluation methodology. Finally, we present\nthe results of the challenge.\n","authors":["Çağkan Yapar","Fabian Jaensch","Ron Levie","Gitta Kutyniok","Giuseppe Caire"],"pdf_url":"https://arxiv.org/pdf/2310.07658v1.pdf","comment":"ICASSP 2023"},{"id":"http://arxiv.org/abs/2310.07654v1","updated":"2023-10-11T16:54:57Z","published":"2023-10-11T16:54:57Z","title":"Audio-Visual Neural Syntax Acquisition","summary":" We study phrase structure induction from visually-grounded speech. The core\nidea is to first segment the speech waveform into sequences of word segments,\nand subsequently induce phrase structure using the inferred segment-level\ncontinuous representations. We present the Audio-Visual Neural Syntax Learner\n(AV-NSL) that learns phrase structure by listening to audio and looking at\nimages, without ever being exposed to text. By training on paired images and\nspoken captions, AV-NSL exhibits the capability to infer meaningful phrase\nstructures that are comparable to those derived by naturally-supervised text\nparsers, for both English and German. Our findings extend prior work in\nunsupervised language acquisition from speech and grounded grammar induction,\nand present one approach to bridge the gap between the two topics.\n","authors":["Cheng-I Jeff Lai","Freda Shi","Puyuan Peng","Yoon Kim","Kevin Gimpel","Shiyu Chang","Yung-Sung Chuang","Saurabhchand Bhati","David Cox","David Harwath","Yang Zhang","Karen Livescu","James Glass"],"pdf_url":"https://arxiv.org/pdf/2310.07654v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.13172v2","updated":"2023-10-11T16:51:50Z","published":"2023-05-22T16:00:00Z","title":"Editing Large Language Models: Problems, Methods, and Opportunities","summary":" Despite the ability to train capable LLMs, the methodology for maintaining\ntheir relevancy and rectifying errors remains elusive. To this end, the past\nfew years have witnessed a surge in techniques for editing LLMs, the objective\nof which is to efficiently alter the behavior of LLMs within a specific domain\nwithout negatively impacting performance across other inputs. This paper\nembarks on a deep exploration of the problems, methods, and opportunities\nrelated to model editing for LLMs. In particular, we provide an exhaustive\noverview of the task definition and challenges associated with model editing,\nalong with an in-depth empirical analysis of the most progressive methods\ncurrently at our disposal. We also build a new benchmark dataset to facilitate\na more robust evaluation and pinpoint enduring issues intrinsic to existing\ntechniques. Our objective is to provide valuable insights into the\neffectiveness and feasibility of each editing technique, thereby assisting the\ncommunity in making informed decisions on the selection of the most appropriate\nmethod for a specific task or context. Code and datasets are available at\nhttps://github.com/zjunlp/EasyEdit.\n","authors":["Yunzhi Yao","Peng Wang","Bozhong Tian","Siyuan Cheng","Zhoubo Li","Shumin Deng","Huajun Chen","Ningyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2305.13172v2.pdf","comment":"EMNLP 2023. 
Updated with new experiments"},{"id":"http://arxiv.org/abs/2310.07648v1","updated":"2023-10-11T16:45:44Z","published":"2023-10-11T16:45:44Z","title":"Hypercomplex Multimodal Emotion Recognition from EEG and Peripheral\n Physiological Signals","summary":" Multimodal emotion recognition from physiological signals is receiving an\nincreasing amount of attention due to the impossibility to control them at will\nunlike behavioral reactions, thus providing more reliable information. Existing\ndeep learning-based methods still rely on extracted handcrafted features, not\ntaking full advantage of the learning ability of neural networks, and often\nadopt a single-modality approach, while human emotions are inherently expressed\nin a multimodal way. In this paper, we propose a hypercomplex multimodal\nnetwork equipped with a novel fusion module comprising parameterized\nhypercomplex multiplications. Indeed, by operating in a hypercomplex domain the\noperations follow algebraic rules which allow to model latent relations among\nlearned feature dimensions for a more effective fusion step. We perform\nclassification of valence and arousal from electroencephalogram (EEG) and\nperipheral physiological signals, employing the publicly available database\nMAHNOB-HCI surpassing a multimodal state-of-the-art network. The code of our\nwork is freely available at https://github.com/ispamm/MHyEEG.\n","authors":["Eleonora Lopez","Eleonora Chiarantano","Eleonora Grassucci","Danilo Comminiello"],"pdf_url":"https://arxiv.org/pdf/2310.07648v1.pdf","comment":"Published at IEEE ICASSP workshops 2023"},{"id":"http://arxiv.org/abs/2310.07644v1","updated":"2023-10-11T16:40:57Z","published":"2023-10-11T16:40:57Z","title":"Rethinking the BERT-like Pretraining for DNA Sequences","summary":" With the success of large-scale pretraining in NLP, there is an increasing\ntrend of applying it to the domain of life sciences. In particular, pretraining\nmethods based on DNA sequences have garnered growing attention due to their\npotential to capture generic information about genes. However, existing\npretraining methods for DNA sequences largely rely on direct adoptions of BERT\npretraining from NLP, lacking a comprehensive understanding and a specifically\ntailored approach. To address this research gap, we first conducted a series of\nexploratory experiments and gained several insightful observations: 1) In the\nfine-tuning phase of downstream tasks, when using K-mer overlapping\ntokenization instead of K-mer non-overlapping tokenization, both overlapping\nand non-overlapping pretraining weights show consistent performance\nimprovement.2) During the pre-training process, using K-mer overlapping\ntokenization quickly produces clear K-mer embeddings and reduces the loss to a\nvery low level, while using K-mer non-overlapping tokenization results in less\ndistinct embeddings and continuously decreases the loss. 3) Using overlapping\ntokenization causes the self-attention in the intermediate layers of\npre-trained models to tend to overly focus on certain tokens, reflecting that\nthese layers are not adequately optimized. In summary, overlapping tokenization\ncan benefit the fine-tuning of downstream tasks but leads to inadequate\npretraining with fast convergence. To unleash the pretraining potential, we\nintroduce a novel approach called RandomMask, which gradually increases the\ntask difficulty of BERT-like pretraining by continuously expanding its mask\nboundary, forcing the model to learn more knowledge. 
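The overlapping versus non-overlapping K-mer tokenization contrasted in the DNA pretraining abstract above is easy to picture with a few lines of Python; the short sequence and k = 3 are arbitrary, and the RandomMask masking schedule itself is not reproduced here.

def kmer_tokenize(seq: str, k: int = 3, overlapping: bool = True):
    """Split a DNA sequence into k-mers, with stride 1 (overlapping) or stride k."""
    stride = 1 if overlapping else k
    return [seq[i:i + k] for i in range(0, len(seq) - k + 1, stride)]

seq = "ATGCGTAC"
print(kmer_tokenize(seq, k=3, overlapping=True))    # ['ATG', 'TGC', 'GCG', 'CGT', 'GTA', 'TAC']
print(kmer_tokenize(seq, k=3, overlapping=False))   # ['ATG', 'CGT']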
RandomMask is simple but\neffective, achieving top-tier performance across 26 datasets of 28 datasets\nspanning 7 downstream tasks.\n","authors":["Chaoqi Liang","Weiqiang Bai","Lifeng Qiao","Yuchen Ren","Jianle Sun","Peng Ye","Hongliang Yan","Xinzhu Ma","Wangmeng Zuo","Wanli Ouyang"],"pdf_url":"https://arxiv.org/pdf/2310.07644v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07641v1","updated":"2023-10-11T16:38:11Z","published":"2023-10-11T16:38:11Z","title":"Evaluating Large Language Models at Evaluating Instruction Following","summary":" As research in large language models (LLMs) continues to accelerate,\nLLM-based evaluation has emerged as a scalable and cost-effective alternative\nto human evaluations for comparing the ever increasing list of models. This\npaper investigates the efficacy of these \"LLM evaluators\", particularly in\nusing them to assess instruction following, a metric that gauges how closely\ngenerated text adheres to the given instruction. We introduce a challenging\nmeta-evaluation benchmark, LLMBar, designed to test the ability of an LLM\nevaluator in discerning instruction-following outputs. The authors manually\ncurated 419 pairs of outputs, one adhering to instructions while the other\ndiverging, yet may possess deceptive qualities that mislead an LLM evaluator,\ne.g., a more engaging tone. Contrary to existing meta-evaluation, we discover\nthat different evaluators (i.e., combinations of LLMs and prompts) exhibit\ndistinct performance on LLMBar and even the highest-scoring ones have\nsubstantial room for improvement. We also present a novel suite of prompting\nstrategies that further close the gap between LLM and human evaluators. With\nLLMBar, we hope to offer more insight into LLM evaluators and foster future\nresearch in developing better instruction-following models.\n","authors":["Zhiyuan Zeng","Jiatong Yu","Tianyu Gao","Yu Meng","Tanya Goyal","Danqi Chen"],"pdf_url":"https://arxiv.org/pdf/2310.07641v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2207.10062v3","updated":"2023-10-11T16:32:49Z","published":"2022-07-20T17:47:54Z","title":"DataPerf: Benchmarks for Data-Centric AI Development","summary":" Machine learning research has long focused on models rather than datasets,\nand prominent datasets are used for common ML tasks without regard to the\nbreadth, difficulty, and faithfulness of the underlying problems. Neglecting\nthe fundamental importance of data has given rise to inaccuracy, bias, and\nfragility in real-world applications, and research is hindered by saturation\nacross existing dataset benchmarks. In response, we present DataPerf, a\ncommunity-led benchmark suite for evaluating ML datasets and data-centric\nalgorithms. We aim to foster innovation in data-centric AI through competition,\ncomparability, and reproducibility. We enable the ML community to iterate on\ndatasets, instead of just architectures, and we provide an open, online\nplatform with multiple rounds of challenges to support this iterative\ndevelopment. The first iteration of DataPerf contains five benchmarks covering\na wide spectrum of data-centric techniques, tasks, and modalities in vision,\nspeech, acquisition, debugging, and diffusion prompting, and we support hosting\nnew contributed benchmarks from the community. 
The benchmarks, online\nevaluation platform, and baseline implementations are open source, and the\nMLCommons Association will maintain DataPerf to ensure long-term benefits to\nacademia and industry.\n","authors":["Mark Mazumder","Colby Banbury","Xiaozhe Yao","Bojan Karlaš","William Gaviria Rojas","Sudnya Diamos","Greg Diamos","Lynn He","Alicia Parrish","Hannah Rose Kirk","Jessica Quaye","Charvi Rastogi","Douwe Kiela","David Jurado","David Kanter","Rafael Mosquera","Juan Ciro","Lora Aroyo","Bilge Acun","Lingjiao Chen","Mehul Smriti Raje","Max Bartolo","Sabri Eyuboglu","Amirata Ghorbani","Emmett Goodman","Oana Inel","Tariq Kane","Christine R. Kirkpatrick","Tzu-Sheng Kuo","Jonas Mueller","Tristan Thrush","Joaquin Vanschoren","Margaret Warren","Adina Williams","Serena Yeung","Newsha Ardalani","Praveen Paritosh","Lilith Bath-Leah","Ce Zhang","James Zou","Carole-Jean Wu","Cody Coleman","Andrew Ng","Peter Mattson","Vijay Janapa Reddi"],"pdf_url":"https://arxiv.org/pdf/2207.10062v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07632v1","updated":"2023-10-11T16:25:45Z","published":"2023-10-11T16:25:45Z","title":"Prompt Backdoors in Visual Prompt Learning","summary":" Fine-tuning large pre-trained computer vision models is infeasible for\nresource-limited users. Visual prompt learning (VPL) has thus emerged to\nprovide an efficient and flexible alternative to model fine-tuning through\nVisual Prompt as a Service (VPPTaaS). Specifically, the VPPTaaS provider\noptimizes a visual prompt given downstream data, and downstream users can use\nthis prompt together with the large pre-trained model for prediction. However,\nthis new learning paradigm may also pose security risks when the VPPTaaS\nprovider instead provides a malicious visual prompt. In this paper, we take the\nfirst step to explore such risks through the lens of backdoor attacks.\nSpecifically, we propose BadVisualPrompt, a simple yet effective backdoor\nattack against VPL. For example, poisoning $5\\%$ CIFAR10 training data leads to\nabove $99\\%$ attack success rates with only negligible model accuracy drop by\n$1.5\\%$. In particular, we identify and then address a new technical challenge\nrelated to interactions between the backdoor trigger and visual prompt, which\ndoes not exist in conventional, model-level backdoors. Moreover, we provide\nin-depth analyses of seven backdoor defenses from model, prompt, and input\nlevels. Overall, all these defenses are either ineffective or impractical to\nmitigate our BadVisualPrompt, implying the critical vulnerability of VPL.\n","authors":["Hai Huang","Zhengyu Zhao","Michael Backes","Yun Shen","Yang Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.07632v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07631v1","updated":"2023-10-11T16:24:06Z","published":"2023-10-11T16:24:06Z","title":"Graph Transformer Network for Flood Forecasting with Heterogeneous\n Covariates","summary":" Floods can be very destructive causing heavy damage to life, property, and\nlivelihoods. Global climate change and the consequent sea-level rise have\nincreased the occurrence of extreme weather events, resulting in elevated and\nfrequent flood risk. Therefore, accurate and timely flood forecasting in\ncoastal river systems is critical to facilitate good flood management. However,\nthe computational tools currently used are either slow or inaccurate. In this\npaper, we propose a Flood prediction tool using Graph Transformer Network\n(FloodGTN) for river systems. 
More specifically, FloodGTN learns the\nspatio-temporal dependencies of water levels at different monitoring stations\nusing Graph Neural Networks (GNNs) and an LSTM. It is currently implemented to\nconsider external covariates such as rainfall, tide, and the settings of\nhydraulic structures (e.g., outflows of dams, gates, pumps, etc.) along the\nriver. We use a Transformer to learn the attention given to external covariates\nin computing water levels. We apply the FloodGTN tool to data from the South\nFlorida Water Management District, which manages a coastal area prone to\nfrequent storms and hurricanes. Experimental results show that FloodGTN\noutperforms the physics-based model (HEC-RAS) by achieving higher accuracy with\n70% improvement while speeding up run times by at least 500x.\n","authors":["Jimeng Shi","Vitalii Stebliankin","Zhaonan Wang","Shaowen Wang","Giri Narasimhan"],"pdf_url":"https://arxiv.org/pdf/2310.07631v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07630v1","updated":"2023-10-11T16:23:07Z","published":"2023-10-11T16:23:07Z","title":"Differentiable Euler Characteristic Transforms for Shape Classification","summary":" The Euler Characteristic Transform (ECT) has proven to be a powerful\nrepresentation, combining geometrical and topological characteristics of shapes\nand graphs. However, the ECT was hitherto unable to learn task-specific\nrepresentations. We overcome this issue and develop a novel computational layer\nthat enables learning the ECT in an end-to-end fashion. Our method DECT is fast\nand computationally efficient, while exhibiting performance on a par with more\ncomplex models in both graph and point cloud classification tasks. Moreover, we\nshow that this seemingly unexpressive statistic still provides the same\ntopological expressivity as more complex topological deep learning layers\nprovide.\n","authors":["Ernst Roell","Bastian Rieck"],"pdf_url":"https://arxiv.org/pdf/2310.07630v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.08268v3","updated":"2023-10-11T16:17:20Z","published":"2023-03-14T23:01:27Z","title":"Chat with the Environment: Interactive Multimodal Perception Using Large\n Language Models","summary":" Programming robot behavior in a complex world faces challenges on multiple\nlevels, from dextrous low-level skills to high-level planning and reasoning.\nRecent pre-trained Large Language Models (LLMs) have shown remarkable reasoning\nability in few-shot robotic planning. However, it remains challenging to ground\nLLMs in multimodal sensory input and continuous action output, while enabling a\nrobot to interact with its environment and acquire novel information as its\npolicies unfold. We develop a robot interaction scenario with a partially\nobservable state, which necessitates a robot to decide on a range of epistemic\nactions in order to sample sensory information among multiple modalities,\nbefore being able to execute the task correctly. Matcha (Multimodal environment\nchatting) agent, an interactive perception framework, is therefore proposed\nwith an LLM as its backbone, whose ability is exploited to instruct epistemic\nactions and to reason over the resulting multimodal sensations (vision, sound,\nhaptics, proprioception), as well as to plan an entire task execution based on\nthe interactively acquired information. 
Our study demonstrates that LLMs can\nprovide high-level planning and reasoning skills and control interactive robot\nbehavior in a multimodal environment, while multimodal modules with the context\nof the environmental state help ground the LLMs and extend their processing\nability. The project website can be found at https://matcha-agent.github.io.\n","authors":["Xufeng Zhao","Mengdi Li","Cornelius Weber","Muhammad Burhan Hafez","Stefan Wermter"],"pdf_url":"https://arxiv.org/pdf/2303.08268v3.pdf","comment":"IROS2023, Detroit. See the project website at\n https://matcha-agent.github.io"},{"id":"http://arxiv.org/abs/2305.11141v3","updated":"2023-10-11T16:16:18Z","published":"2023-05-18T17:35:35Z","title":"Clifford Group Equivariant Neural Networks","summary":" We introduce Clifford Group Equivariant Neural Networks: a novel approach for\nconstructing $\\mathrm{O}(n)$- and $\\mathrm{E}(n)$-equivariant models. We\nidentify and study the $\\textit{Clifford group}$, a subgroup inside the\nClifford algebra whose definition we adjust to achieve several favorable\nproperties. Primarily, the group's action forms an orthogonal automorphism that\nextends beyond the typical vector space to the entire Clifford algebra while\nrespecting the multivector grading. This leads to several non-equivalent\nsubrepresentations corresponding to the multivector decomposition. Furthermore,\nwe prove that the action respects not just the vector space structure of the\nClifford algebra but also its multiplicative structure, i.e., the geometric\nproduct. These findings imply that every polynomial in multivectors constitutes\nan equivariant map with respect to the Clifford group. An advantage worth\nmentioning is that we obtain expressive layers that can elegantly generalize to\ninner-product spaces of any dimension. We demonstrate,\nnotably from a single core implementation, state-of-the-art performance on\nseveral distinct tasks, including a three-dimensional $n$-body experiment, a\nfour-dimensional Lorentz-equivariant high-energy physics experiment, and a\nfive-dimensional convex hull experiment.\n","authors":["David Ruhe","Johannes Brandstetter","Patrick Forré"],"pdf_url":"https://arxiv.org/pdf/2305.11141v3.pdf","comment":"Published at NeurIPS 2023 (Oral)"},{"id":"http://arxiv.org/abs/2310.07626v1","updated":"2023-10-11T16:09:09Z","published":"2023-10-11T16:09:09Z","title":"Unsupervised Learning of Sea Surface Height Interpolation from\n Multi-variate Simulated Satellite Observations","summary":" Satellite-based remote sensing missions have revolutionized our understanding\nof the Ocean state and dynamics. Among them, spaceborne altimetry provides\nvaluable measurements of Sea Surface Height (SSH), which is used to estimate\nsurface geostrophic currents. However, due to the sensor technology employed,\nimportant gaps occur in SSH observations. Complete SSH maps are produced by the\naltimetry community using linear Optimal Interpolations (OI) such as the\nwidely-used Data Unification and Altimeter Combination System (DUACS). However,\nOI is known for producing overly smooth fields and thus misses some\nmesostructures and eddies. On the other hand, Sea Surface Temperature (SST)\nproducts have much higher data coverage and SST is physically linked to\ngeostrophic currents through advection. We design a realistic twin experiment\nto emulate the satellite observations of SSH and SST to evaluate interpolation\nmethods. 
We introduce a deep learning network able to use SST information, and\na trainable in two settings: one where we have no access to ground truth during\ntraining and one where it is accessible. Our investigation involves a\ncomparative analysis of the aforementioned network when trained using either\nsupervised or unsupervised loss functions. We assess the quality of SSH\nreconstructions and further evaluate the network's performance in terms of eddy\ndetection and physical properties. We find that it is possible, even in an\nunsupervised setting to use SST to improve reconstruction performance compared\nto SST-agnostic interpolations. We compare our reconstructions to DUACS's and\nreport a decrease of 41\\% in terms of root mean squared error.\n","authors":["Theo Archambault","Arthur Filoche","Anastase Charantonis","Dominique Bereziat","Sylvie Thiria"],"pdf_url":"https://arxiv.org/pdf/2310.07626v1.pdf","comment":"submitted to JAMES. 26 pages"},{"id":"http://arxiv.org/abs/2305.12534v2","updated":"2023-10-11T16:05:21Z","published":"2023-05-21T18:26:31Z","title":"BertRLFuzzer: A BERT and Reinforcement Learning based Fuzzer","summary":" We present a novel tool BertRLFuzzer, a BERT and Reinforcement Learning (RL)\nbased fuzzer aimed at finding security vulnerabilities for Web applications.\nBertRLFuzzer works as follows: given a set of seed inputs, the fuzzer performs\ngrammar-adhering and attack-provoking mutation operations on them to generate\ncandidate attack vectors. The key insight of BertRLFuzzer is the use of RL with\na BERT model as an agent to guide the fuzzer to efficiently learn\ngrammar-adhering and attack-provoking mutation operators. In order to establish\nthe efficacy of BertRLFuzzer we compare it against a total of 13 black box and\nwhite box fuzzers over a benchmark of 9 victim websites with over 16K LOC. We\nobserved a significant improvement, relative to the nearest competing tool, in\nterms of time to first attack (54% less), new vulnerabilities found (17 new\nvulnerabilities), and attack rate (4.4% more attack vectors generated).\n","authors":["Piyush Jha","Joseph Scott","Jaya Sriram Ganeshna","Mudit Singh","Vijay Ganesh"],"pdf_url":"https://arxiv.org/pdf/2305.12534v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07612v1","updated":"2023-10-11T15:56:55Z","published":"2023-10-11T15:56:55Z","title":"PHYDI: Initializing Parameterized Hypercomplex Neural Networks as\n Identity Functions","summary":" Neural models based on hypercomplex algebra systems are growing and\nprolificating for a plethora of applications, ranging from computer vision to\nnatural language processing. Hand in hand with their adoption, parameterized\nhypercomplex neural networks (PHNNs) are growing in size and no techniques have\nbeen adopted so far to control their convergence at a large scale. In this\npaper, we study PHNNs convergence and propose parameterized hypercomplex\nidentity initialization (PHYDI), a method to improve their convergence at\ndifferent scales, leading to more robust performance when the number of layers\nscales up, while also reaching the same performance with fewer iterations. We\nshow the effectiveness of this approach in different benchmarks and with common\nPHNNs with ResNets- and Transformer-based architecture. 
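The PHYDI abstract above proposes initializing parameterized hypercomplex networks as identity functions. The abstract does not give the hypercomplex construction, so the PyTorch sketch below shows only the familiar real-valued analogue of that idea: zero-initializing the last layer of a residual branch so each block starts out as the identity map.

import torch
import torch.nn as nn

class IdentityInitBlock(nn.Module):
    """Residual block whose branch output is exactly zero at initialization."""
    def __init__(self, dim: int):
        super().__init__()
        self.fc1 = nn.Linear(dim, dim)
        self.fc2 = nn.Linear(dim, dim)
        nn.init.zeros_(self.fc2.weight)   # branch contributes nothing at init...
        nn.init.zeros_(self.fc2.bias)     # ...so the block computes y = x exactly

    def forward(self, x):
        return x + self.fc2(torch.relu(self.fc1(x)))

block = IdentityInitBlock(8)
x = torch.randn(4, 8)
print(torch.allclose(block(x), x))   # True at initialization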
The code is available\nat https://github.com/ispamm/PHYDI.\n","authors":["Matteo Mancanelli","Eleonora Grassucci","Aurelio Uncini","Danilo Comminiello"],"pdf_url":"https://arxiv.org/pdf/2310.07612v1.pdf","comment":"Accepted at IEEE MLSP 2023 (Honorable Mention TOP 5% Outstanding\n Papers)"},{"id":"http://arxiv.org/abs/2203.06768v4","updated":"2023-10-11T15:46:16Z","published":"2022-03-13T21:39:24Z","title":"Probabilistically Robust Recourse: Navigating the Trade-offs between\n Costs and Robustness in Algorithmic Recourse","summary":" As machine learning models are increasingly being employed to make\nconsequential decisions in real-world settings, it becomes critical to ensure\nthat individuals who are adversely impacted (e.g., loan denied) by the\npredictions of these models are provided with a means for recourse. While\nseveral approaches have been proposed to construct recourses for affected\nindividuals, the recourses output by these methods either achieve low costs\n(i.e., ease-of-implementation) or robustness to small perturbations (i.e.,\nnoisy implementations of recourses), but not both due to the inherent\ntrade-offs between the recourse costs and robustness. Furthermore, prior\napproaches do not provide end users with any agency over navigating the\naforementioned trade-offs. In this work, we address the above challenges by\nproposing the first algorithmic framework which enables users to effectively\nmanage the recourse cost vs. robustness trade-offs. More specifically, our\nframework Probabilistically ROBust rEcourse (\\texttt{PROBE}) lets users choose\nthe probability with which a recourse could get invalidated (recourse\ninvalidation rate) if small changes are made to the recourse i.e., the recourse\nis implemented somewhat noisily. To this end, we propose a novel objective\nfunction which simultaneously minimizes the gap between the achieved\n(resulting) and desired recourse invalidation rates, minimizes recourse costs,\nand also ensures that the resulting recourse achieves a positive model\nprediction. We develop novel theoretical results to characterize the recourse\ninvalidation rates corresponding to any given instance w.r.t. different classes\nof underlying models (e.g., linear models, tree based models etc.), and\nleverage these results to efficiently optimize the proposed objective.\nExperimental evaluation with multiple real world datasets demonstrates the\nefficacy of the proposed framework.\n","authors":["Martin Pawelczyk","Teresa Datta","Johannes van-den-Heuvel","Gjergji Kasneci","Himabindu Lakkaraju"],"pdf_url":"https://arxiv.org/pdf/2203.06768v4.pdf","comment":"ICLR 2023, camera ready version"},{"id":"http://arxiv.org/abs/2310.07598v1","updated":"2023-10-11T15:38:53Z","published":"2023-10-11T15:38:53Z","title":"Survey on Imbalanced Data, Representation Learning and SEP Forecasting","summary":" Deep Learning methods have significantly advanced various data-driven tasks\nsuch as regression, classification, and forecasting. However, much of this\nprogress has been predicated on the strong but often unrealistic assumption\nthat training datasets are balanced with respect to the targets they contain.\nThis misalignment with real-world conditions, where data is frequently\nimbalanced, hampers the effectiveness of such models in practical applications.\nMethods that reconsider that assumption and tackle real-world imbalances have\nbegun to emerge and explore avenues to address this challenge. 
One such\npromising avenue is representation learning, which enables models to capture\ncomplex data characteristics and generalize better to minority classes. By\nfocusing on a richer representation of the feature space, these techniques hold\nthe potential to mitigate the impact of data imbalance. In this survey, we\npresent deep learning works that step away from the balanced-data assumption,\nemploying strategies like representation learning to better approximate\nreal-world imbalances. We also highlight a critical application in SEP\nforecasting where addressing data imbalance is paramount for success.\n","authors":["Josias Moukpe"],"pdf_url":"https://arxiv.org/pdf/2310.07598v1.pdf","comment":"Survey Paper, 4 figures, 16 pages"},{"id":"http://arxiv.org/abs/2310.07596v1","updated":"2023-10-11T15:37:31Z","published":"2023-10-11T15:37:31Z","title":"Prospective Side Information for Latent MDPs","summary":" In many interactive decision-making settings, there is latent and unobserved\ninformation that remains fixed. Consider, for example, a dialogue system, where\ncomplete information about a user, such as the user's preferences, is not\ngiven. In such an environment, the latent information remains fixed throughout\neach episode, since the identity of the user does not change during an\ninteraction. This type of environment can be modeled as a Latent Markov\nDecision Process (LMDP), a special instance of Partially Observed Markov\nDecision Processes (POMDPs). Previous work established exponential lower bounds\nin the number of latent contexts for the LMDP class. This puts forward a\nquestion: under which natural assumptions a near-optimal policy of an LMDP can\nbe efficiently learned? In this work, we study the class of LMDPs with {\\em\nprospective side information}, when an agent receives additional, weakly\nrevealing, information on the latent context at the beginning of each episode.\nWe show that, surprisingly, this problem is not captured by contemporary\nsettings and algorithms designed for partially observed environments. We then\nestablish that any sample efficient algorithm must suffer at least\n$\\Omega(K^{2/3})$-regret, as opposed to standard $\\Omega(\\sqrt{K})$ lower\nbounds, and design an algorithm with a matching upper bound.\n","authors":["Jeongyeol Kwon","Yonathan Efroni","Shie Mannor","Constantine Caramanis"],"pdf_url":"https://arxiv.org/pdf/2310.07596v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07592v1","updated":"2023-10-11T15:35:20Z","published":"2023-10-11T15:35:20Z","title":"Transformers for Green Semantic Communication: Less Energy, More\n Semantics","summary":" Semantic communication aims to transmit meaningful and effective information\nrather than focusing on individual symbols or bits, resulting in benefits like\nreduced latency, bandwidth usage, and higher throughput compared to traditional\ncommunication. However, semantic communication poses significant challenges due\nto the need for universal metrics for benchmarking the joint effects of\nsemantic information loss and practical energy consumption. This research\npresents a novel multi-objective loss function named \"Energy-Optimized Semantic\nLoss\" (EOSL), addressing the challenge of balancing semantic information loss\nand energy consumption. 
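The EOSL abstract above describes a multi-objective loss that balances semantic loss against energy consumption but does not state its functional form, so the helper below is only an assumed convex-combination placeholder, with invented candidate models and an arbitrary energy budget, meant to show how such a criterion could drive encoder selection.

def energy_optimized_semantic_loss(semantic_loss: float,
                                   energy_joules: float,
                                   energy_budget_joules: float,
                                   alpha: float = 0.5) -> float:
    """Assumed convex combination of semantic loss and normalized energy cost.

    The real EOSL formulation may differ; alpha and the normalization here are
    illustrative choices only.
    """
    energy_term = energy_joules / energy_budget_joules
    return alpha * semantic_loss + (1.0 - alpha) * energy_term

# Rank three hypothetical encoder models by the combined criterion.
candidates = {
    "tiny":  {"semantic_loss": 0.30, "energy_joules": 5.0},
    "base":  {"semantic_loss": 0.18, "energy_joules": 20.0},
    "large": {"semantic_loss": 0.15, "energy_joules": 80.0},
}
budget = 100.0
ranked = sorted(candidates.items(),
                key=lambda kv: energy_optimized_semantic_loss(
                    kv[1]["semantic_loss"], kv[1]["energy_joules"], budget))
print([name for name, _ in ranked])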
Through comprehensive experiments on transformer\nmodels, including CPU and GPU energy usage, it is demonstrated that EOSL-based\nencoder model selection can save up to 90\\% of energy while achieving a 44\\%\nimprovement in semantic similarity performance during inference in this\nexperiment. This work paves the way for energy-efficient neural network\nselection and the development of greener semantic communication architectures.\n","authors":["Shubhabrata Mukherjee","Cory Beard","Sejun Song"],"pdf_url":"https://arxiv.org/pdf/2310.07592v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.14137v3","updated":"2023-10-11T15:34:51Z","published":"2022-08-30T10:35:32Z","title":"On the Trade-Off between Actionable Explanations and the Right to be\n Forgotten","summary":" As machine learning (ML) models are increasingly being deployed in\nhigh-stakes applications, policymakers have suggested tighter data protection\nregulations (e.g., GDPR, CCPA). One key principle is the \"right to be\nforgotten\" which gives users the right to have their data deleted. Another key\nprinciple is the right to an actionable explanation, also known as algorithmic\nrecourse, allowing users to reverse unfavorable decisions. To date, it is\nunknown whether these two principles can be operationalized simultaneously.\nTherefore, we introduce and study the problem of recourse invalidation in the\ncontext of data deletion requests. More specifically, we theoretically and\nempirically analyze the behavior of popular state-of-the-art algorithms and\ndemonstrate that the recourses generated by these algorithms are likely to be\ninvalidated if a small number of data deletion requests (e.g., 1 or 2) warrant\nupdates of the predictive model. For the setting of differentiable models, we\nsuggest a framework to identify a minimal subset of critical training points\nwhich, when removed, maximize the fraction of invalidated recourses. Using our\nframework, we empirically show that the removal of as little as 2 data\ninstances from the training set can invalidate up to 95 percent of all\nrecourses output by popular state-of-the-art algorithms. Thus, our work raises\nfundamental questions about the compatibility of \"the right to an actionable\nexplanation\" in the context of the \"right to be forgotten\", while also\nproviding constructive insights on the determining factors of recourse\nrobustness.\n","authors":["Martin Pawelczyk","Tobias Leemann","Asia Biega","Gjergji Kasneci"],"pdf_url":"https://arxiv.org/pdf/2208.14137v3.pdf","comment":"ICLR 2023 camera ready version"},{"id":"http://arxiv.org/abs/2310.07588v1","updated":"2023-10-11T15:28:44Z","published":"2023-10-11T15:28:44Z","title":"Accurate Use of Label Dependency in Multi-Label Text Classification\n Through the Lens of Causality","summary":" Multi-Label Text Classification (MLTC) aims to assign the most relevant\nlabels to each given text. Existing methods demonstrate that label dependency\ncan help to improve the model's performance. However, the introduction of label\ndependency may cause the model to suffer from unwanted prediction bias. In this\nstudy, we attribute the bias to the model's misuse of label dependency, i.e.,\nthe model tends to utilize the correlation shortcut in label dependency rather\nthan fusing text information and label dependency for prediction. 
Motivated by\ncausal inference, we propose a CounterFactual Text Classifier (CFTC) to\neliminate the correlation bias, and make causality-based predictions.\nSpecifically, our CFTC first adopts the predict-then-modify backbone to extract\nprecise label information embedded in label dependency, then blocks the\ncorrelation shortcut through the counterfactual de-bias technique with the help\nof the human causal graph. Experimental results on three datasets demonstrate\nthat our CFTC significantly outperforms the baselines and effectively\neliminates the correlation bias in datasets.\n","authors":["Caoyun Fan","Wenqing Chen","Jidong Tian","Yitian Li","Hao He","Yaohui Jin"],"pdf_url":"https://arxiv.org/pdf/2310.07588v1.pdf","comment":"Applied Intelligence 2023"},{"id":"http://arxiv.org/abs/2310.07587v1","updated":"2023-10-11T15:28:39Z","published":"2023-10-11T15:28:39Z","title":"Fed-GraB: Federated Long-tailed Learning with Self-Adjusting Gradient\n Balancer","summary":" Data privacy and long-tailed distribution are the norms rather than the\nexception in many real-world tasks. This paper investigates a federated\nlong-tailed learning (Fed-LT) task in which each client holds a locally\nheterogeneous dataset; if the datasets can be globally aggregated, they jointly\nexhibit a long-tailed distribution. Under such a setting, existing federated\noptimization and/or centralized long-tailed learning methods hardly apply due\nto challenges in (a) characterizing the global long-tailed distribution under\nprivacy constraints and (b) adjusting the local learning strategy to cope with\nthe head-tail imbalance. In response, we propose a method termed\n$\\texttt{Fed-GraB}$, comprised of a Self-adjusting Gradient Balancer (SGB)\nmodule that re-weights clients' gradients in a closed-loop manner, based on the\nfeedback of global long-tailed distribution evaluated by a Direct Prior\nAnalyzer (DPA) module. Using $\\texttt{Fed-GraB}$, clients can effectively\nalleviate the distribution drift caused by data heterogeneity during the model\ntraining process and obtain a global model with better performance on the\nminority classes while maintaining the performance of the majority classes.\nExtensive experiments demonstrate that $\\texttt{Fed-GraB}$ achieves\nstate-of-the-art performance on representative datasets such as CIFAR-10-LT,\nCIFAR-100-LT, ImageNet-LT, and iNaturalist.\n","authors":["Zikai Xiao","Zihan Chen","Songshang Liu","Hualiang Wang","Yang Feng","Jin Hao","Joey Tianyi Zhou","Jian Wu","Howard Hao Yang","Zuozhu Liu"],"pdf_url":"https://arxiv.org/pdf/2310.07587v1.pdf","comment":"Accepted by NeurIPS 2023"},{"id":"http://arxiv.org/abs/2309.11942v2","updated":"2023-10-11T15:21:32Z","published":"2023-09-21T09:57:03Z","title":"On the Probability of Immunity","summary":" This work is devoted to the study of the probability of immunity, i.e. the\neffect occurs whether exposed or not. We derive necessary and sufficient\nconditions for non-immunity and $\\epsilon$-bounded immunity, i.e. the\nprobability of immunity is zero and $\\epsilon$-bounded, respectively. The\nformer allows us to estimate the probability of benefit (i.e., the effect\noccurs if and only if exposed) from a randomized controlled trial, and the\nlatter allows us to produce bounds of the probability of benefit that are\ntighter than the existing ones. 
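For concreteness, the two quantities discussed above can be written in standard potential-outcome notation (an assumption of this sketch, with $Y_1$ and $Y_0$ denoting the outcome under exposure and non-exposure), following the verbal definitions given in the abstract:

```latex
% Potential-outcome notation is assumed; Y_1 and Y_0 are the outcomes under
% exposure and non-exposure, respectively.
\begin{align*}
P(\text{immunity}) &= P(Y_1 = 1,\; Y_0 = 1) && \text{the effect occurs whether exposed or not,} \\
P(\text{benefit})  &= P(Y_1 = 1,\; Y_0 = 0) && \text{the effect occurs if and only if exposed.}
\end{align*}
```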
We also introduce the concept of indirect\nimmunity (i.e., through a mediator) and repeat our previous analysis for it.\nFinally, we propose a method for sensitivity analysis of the probability of\nimmunity under unmeasured confounding.\n","authors":["Jose M. Peña"],"pdf_url":"https://arxiv.org/pdf/2309.11942v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07582v1","updated":"2023-10-11T15:20:07Z","published":"2023-10-11T15:20:07Z","title":"Linear Latent World Models in Simple Transformers: A Case Study on\n Othello-GPT","summary":" Foundation models exhibit significant capabilities in decision-making and\nlogical deductions. Nonetheless, a continuing discourse persists regarding\ntheir genuine understanding of the world as opposed to mere stochastic mimicry.\nThis paper meticulously examines a simple transformer trained for Othello,\nextending prior research to enhance comprehension of the emergent world model\nof Othello-GPT. The investigation reveals that Othello-GPT encapsulates a\nlinear representation of opposing pieces, a factor that causally steers its\ndecision-making process. This paper further elucidates the interplay between\nthe linear world representation and causal decision-making, and their\ndependence on layer depth and model complexity. We have made the code public.\n","authors":["Dean S. Hazineh","Zechen Zhang","Jeffery Chiu"],"pdf_url":"https://arxiv.org/pdf/2310.07582v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07579v1","updated":"2023-10-11T15:19:31Z","published":"2023-10-11T15:19:31Z","title":"In-Context Unlearning: Language Models as Few Shot Unlearners","summary":" Machine unlearning, the study of efficiently removing the impact of specific\ntraining points on the trained model, has garnered increased attention of late,\ndriven by the need to comply with privacy regulations like the \\emph{Right to\nbe Forgotten}. Although unlearning is particularly relevant for LLMs in light\nof the copyright issues they raise, achieving precise unlearning is\ncomputationally infeasible for very large models. To this end, recent work has\nproposed several algorithms which approximate the removal of training data\nwithout retraining the model. These algorithms crucially rely on access to the\nmodel parameters in order to update them, an assumption that may not hold in\npractice due to computational constraints or when the LLM is accessed via API.\nIn this work, we propose a new class of unlearning methods for LLMs we call\n``In-Context Unlearning'', providing inputs in context and without having to\nupdate model parameters. To unlearn a particular training instance, we provide\nthe instance alongside a flipped label and additional correctly labelled\ninstances which are prepended as inputs to the LLM at inference time. Our\nexperimental results demonstrate that these contexts effectively remove\nspecific information from the training set while maintaining performance levels\nthat are competitive with (or in some cases exceed) state-of-the-art unlearning\nmethods that require access to the LLM parameters.\n","authors":["Martin Pawelczyk","Seth Neel","Himabindu Lakkaraju"],"pdf_url":"https://arxiv.org/pdf/2310.07579v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07576v1","updated":"2023-10-11T15:17:55Z","published":"2023-10-11T15:17:55Z","title":"Analyzing Trendy Twitter Hashtags in the 2022 French Election","summary":" Regressions trained to predict the future activity of social media users need\nrich features for accurate predictions. 
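Returning briefly to the in-context unlearning recipe summarized above (the instance to forget is shown with a flipped label, alongside correctly labelled examples, all prepended to the query at inference time), a minimal sketch of the prompt construction might look as follows. The template, task, and labels are assumptions for illustration, not the authors' exact format.

```python
# Minimal sketch of in-context unlearning prompt construction: the instance to
# "forget" appears with a flipped label, followed by correctly labelled
# examples, all prepended to the query at inference time. The template below
# is an assumption for illustration only.

def flip_label(label: str) -> str:
    return "negative" if label == "positive" else "positive"

def build_unlearning_prompt(forget_example, context_examples, query_text):
    """forget_example and context_examples are (text, label) pairs."""
    lines = []
    text, label = forget_example
    lines.append(f"Review: {text}\nSentiment: {flip_label(label)}")  # flipped label
    for text, label in context_examples:
        lines.append(f"Review: {text}\nSentiment: {label}")          # correct labels
    lines.append(f"Review: {query_text}\nSentiment:")
    return "\n\n".join(lines)

prompt = build_unlearning_prompt(
    forget_example=("The movie was wonderful.", "positive"),
    context_examples=[("Terrible pacing and acting.", "negative"),
                      ("A delightful surprise.", "positive")],
    query_text="I would watch it again.",
)
print(prompt)
```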
Many advanced models exist to generate\nsuch features; however, the time complexities of their computations are often\nprohibitive when they run on enormous data-sets. Some studies have shown that\nsimple semantic network features can be rich enough to use for regressions\nwithout requiring complex computations. We propose a method for using semantic\nnetworks as user-level features for machine learning tasks. We conducted an\nexperiment using a semantic network of 1037 Twitter hashtags from a corpus of\n3.7 million tweets related to the 2022 French presidential election. A\nbipartite graph is formed where hashtags are nodes and weighted edges connect\nthe hashtags reflecting the number of Twitter users that interacted with both\nhashtags. The graph is then transformed into a maximum-spanning tree with the\nmost popular hashtag as its root node to construct a hierarchy amongst the\nhashtags. We then provide a vector feature for each user based on this tree. To\nvalidate the usefulness of our semantic feature we performed a regression\nexperiment to predict the response rate of each user with six emotions like\nanger, enjoyment, or disgust. Our semantic feature performs well with the\nregression with most emotions having $R^2$ above 0.5. These results suggest\nthat our semantic feature could be considered for use in further experiments\npredicting social media response on big data-sets.\n","authors":["Aamir Mandviwalla","Lake Yin","Boleslaw K. Szymanski"],"pdf_url":"https://arxiv.org/pdf/2310.07576v1.pdf","comment":"9 pages, 1 figure, to be published in Complex Networks 2023"},{"id":"http://arxiv.org/abs/2211.15751v3","updated":"2023-10-11T15:13:02Z","published":"2022-11-28T20:11:37Z","title":"Edge Video Analytics: A Survey on Applications, Systems and Enabling\n Techniques","summary":" Video, as a key driver in the global explosion of digital information, can\ncreate tremendous benefits for human society. Governments and enterprises are\ndeploying innumerable cameras for a variety of applications, e.g., law\nenforcement, emergency management, traffic control, and security surveillance,\nall facilitated by video analytics (VA). This trend is spurred by the rapid\nadvancement of deep learning (DL), which enables more precise models for object\nclassification, detection, and tracking. Meanwhile, with the proliferation of\nInternet-connected devices, massive amounts of data are generated daily,\noverwhelming the cloud. Edge computing, an emerging paradigm that moves\nworkloads and services from the network core to the network edge, has been\nwidely recognized as a promising solution. The resulting new intersection, edge\nvideo analytics (EVA), begins to attract widespread attention. Nevertheless,\nonly a few loosely-related surveys exist on this topic. The basic concepts of\nEVA (e.g., definition, architectures) were not fully elucidated due to the\nrapid development of this domain. To fill these gaps, we provide a\ncomprehensive survey of the recent efforts on EVA. In this paper, we first\nreview the fundamentals of edge computing, followed by an overview of VA. EVA\nsystems and their enabling techniques are discussed next. In addition, we\nintroduce prevalent frameworks and datasets to aid future researchers in the\ndevelopment of EVA systems. Finally, we discuss existing challenges and foresee\nfuture research directions. 
We believe this survey will help readers comprehend\nthe relationship between VA and edge computing, and spark new ideas on EVA.\n","authors":["Renjie Xu","Saiedeh Razavi","Rong Zheng"],"pdf_url":"https://arxiv.org/pdf/2211.15751v3.pdf","comment":"Accepted in IEEE Communications Surveys and Tutorials, 2023"},{"id":"http://arxiv.org/abs/2304.06548v2","updated":"2023-10-11T15:09:11Z","published":"2023-04-13T13:57:33Z","title":"Multi-kernel Correntropy-based Orientation Estimation of IMUs: Gradient\n Descent Methods","summary":" This paper presents two computationally efficient algorithms for the\norientation estimation of inertial measurement units (IMUs): the\ncorrentropy-based gradient descent (CGD) and the correntropy-based decoupled\norientation estimation (CDOE). Traditional methods, such as gradient descent\n(GD) and decoupled orientation estimation (DOE), rely on the mean squared error\n(MSE) criterion, making them vulnerable to external acceleration and magnetic\ninterference. To address this issue, we demonstrate that the multi-kernel\ncorrentropy loss (MKCL) is an optimal objective function for maximum likelihood\nestimation (MLE) when the noise follows a type of heavy-tailed distribution. In\ncertain situations, the estimation error of the MKCL is bounded even in the\npresence of arbitrarily large outliers. By replacing the standard MSE cost\nfunction with MKCL, we develop the CGD and CDOE algorithms. We evaluate the\neffectiveness of our proposed methods by comparing them with existing\nalgorithms in various situations. Experimental results indicate that our\nproposed methods (CGD and CDOE) outperform their conventional counterparts (GD\nand DOE), especially when faced with external acceleration and magnetic\ndisturbances. Furthermore, the new algorithms demonstrate significantly lower\ncomputational complexity than Kalman filter-based approaches, making them\nsuitable for applications with low-cost microprocessors.\n","authors":["Shilei Li","Lijing Li","Dawei Shi","Yunjiang Lou","Ling Shi"],"pdf_url":"https://arxiv.org/pdf/2304.06548v2.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2304.14933v2","updated":"2023-10-11T15:08:51Z","published":"2023-04-28T15:43:21Z","title":"An Empirical Study of Multimodal Model Merging","summary":" Model merging (e.g., via interpolation or task arithmetic) fuses multiple\nmodels trained on different tasks to generate a multi-task solution. The\ntechnique has been proven successful in previous studies, where the models are\ntrained on similar tasks and with the same initialization. In this paper, we\nexpand on this concept to a multimodal setup by merging transformers trained on\ndifferent modalities. Furthermore, we conduct our study for a novel goal where\nwe can merge vision, language, and cross-modal transformers of a\nmodality-specific architecture to create a parameter-efficient\nmodality-agnostic architecture. Through comprehensive experiments, we\nsystematically investigate the key factors impacting model performance after\nmerging, including initialization, merging mechanisms, and model architectures.\nWe also propose two metrics that assess the distance between weights to be\nmerged and can serve as an indicator of the merging outcomes. 
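To make the interpolation-style merging mentioned above concrete, the sketch below linearly interpolates the parameters of two checkpoints weight-by-weight. The mixing coefficient, the assumption of identical architectures, and the toy modules are choices of this sketch rather than the paper's recipe.

```python
# Minimal sketch of interpolation-style model merging: parameters of two
# checkpoints sharing an architecture are linearly interpolated per tensor.
# The mixing coefficient and toy modules are assumptions of this sketch.
from typing import Dict
import torch

def interpolate_state_dicts(sd_a: Dict[str, torch.Tensor],
                            sd_b: Dict[str, torch.Tensor],
                            alpha: float = 0.5) -> Dict[str, torch.Tensor]:
    """Return alpha * sd_a + (1 - alpha) * sd_b for matching parameter names."""
    assert sd_a.keys() == sd_b.keys(), "checkpoints must share the same architecture"
    return {name: alpha * sd_a[name] + (1.0 - alpha) * sd_b[name]
            for name in sd_a}

# Example with two tiny modules of the same shape.
model_a = torch.nn.Linear(4, 2)
model_b = torch.nn.Linear(4, 2)
merged = torch.nn.Linear(4, 2)
merged.load_state_dict(interpolate_state_dicts(model_a.state_dict(),
                                               model_b.state_dict(),
                                               alpha=0.5))
```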
Our analysis\nleads to an effective training recipe for matching the performance of the\nmodality-agnostic baseline (i.e., pre-trained from scratch) via model merging.\nOur method also outperforms naive merging significantly on various tasks, with\nimprovements of 3% on VQA, 7% on COCO retrieval, 25% on NLVR2, 14% on Flickr30k\nand 3% on ADE20k. Our code is available at https://github.com/ylsung/vl-merging\n","authors":["Yi-Lin Sung","Linjie Li","Kevin Lin","Zhe Gan","Mohit Bansal","Lijuan Wang"],"pdf_url":"https://arxiv.org/pdf/2304.14933v2.pdf","comment":"EMNLP 2023 Findings"},{"id":"http://arxiv.org/abs/2310.05469v2","updated":"2023-10-11T15:07:36Z","published":"2023-10-09T07:26:35Z","title":"Vibroacoustic Frequency Response Prediction with Query-based Operator\n Networks","summary":" Understanding vibroacoustic wave propagation in mechanical structures like\nairplanes, cars and houses is crucial to ensure health and comfort of their\nusers. To analyze such systems, designers and engineers primarily consider the\ndynamic response in the frequency domain, which is computed through expensive\nnumerical simulations like the finite element method. In contrast, data-driven\nsurrogate models offer the promise of speeding up these simulations, thereby\nfacilitating tasks like design optimization, uncertainty quantification, and\ndesign space exploration. We present a structured benchmark for a\nrepresentative vibroacoustic problem: Predicting the frequency response for\nvibrating plates with varying forms of beadings. The benchmark features a total\nof 12,000 plate geometries with an associated numerical solution and introduces\nevaluation metrics to quantify the prediction quality. To address the frequency\nresponse prediction task, we propose a novel frequency query operator model,\nwhich is trained to map plate geometries to frequency response functions. By\nintegrating principles from operator learning and implicit models for shape\nencoding, our approach effectively addresses the prediction of resonance peaks\nof frequency responses. We evaluate the method on our vibrating-plates\nbenchmark and find that it outperforms DeepONets, Fourier Neural Operators and\nmore traditional neural network architectures. The code and dataset are\navailable from https://eckerlab.org/code/delden2023_plate.\n","authors":["Jan van Delden","Julius Schultz","Christopher Blech","Sabine C. Langer","Timo Lüddecke"],"pdf_url":"https://arxiv.org/pdf/2310.05469v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07560v1","updated":"2023-10-11T15:04:33Z","published":"2023-10-11T15:04:33Z","title":"ROMO: Retrieval-enhanced Offline Model-based Optimization","summary":" Data-driven black-box model-based optimization (MBO) problems arise in a\ngreat number of practical application scenarios, where the goal is to find a\ndesign over the whole space maximizing a black-box target function based on a\nstatic offline dataset. In this work, we consider a more general but\nchallenging MBO setting, named constrained MBO (CoMBO), where only part of the\ndesign space can be optimized while the rest is constrained by the environment.\nA new challenge arising from CoMBO is that most observed designs that satisfy\nthe constraints are mediocre in evaluation. Therefore, we focus on optimizing\nthese mediocre designs in the offline dataset while maintaining the given\nconstraints rather than further boosting the best observed design in the\ntraditional MBO setting. 
We propose retrieval-enhanced offline model-based\noptimization (ROMO), a new derivable forward approach that retrieves the\noffline dataset and aggregates relevant samples to provide a trusted\nprediction, and use it for gradient-based optimization. ROMO is simple to\nimplement and outperforms state-of-the-art approaches in the CoMBO setting.\nEmpirically, we conduct experiments on a synthetic Hartmann (3D) function\ndataset, an industrial CIO dataset, and a suite of modified tasks in the\nDesign-Bench benchmark. Results show that ROMO performs well in a wide range of\nconstrained optimization tasks.\n","authors":["Mingcheng Chen","Haoran Zhao","Yuxiang Zhao","Hulei Fan","Hongqiao Gao","Yong Yu","Zheng Tian"],"pdf_url":"https://arxiv.org/pdf/2310.07560v1.pdf","comment":"15 pages, 9 figures"},{"id":"http://arxiv.org/abs/2310.07558v1","updated":"2023-10-11T15:02:13Z","published":"2023-10-11T15:02:13Z","title":"Smootheness-Adaptive Dynamic Pricing with Nonparametric Demand Learning","summary":" We study the dynamic pricing problem where the demand function is\nnonparametric and H\\\"older smooth, and we focus on adaptivity to the unknown\nH\\\"older smoothness parameter $\\beta$ of the demand function. Traditionally the\noptimal dynamic pricing algorithm heavily relies on the knowledge of $\\beta$ to\nachieve a minimax optimal regret of\n$\\widetilde{O}(T^{\\frac{\\beta+1}{2\\beta+1}})$. However, we highlight the\nchallenge of adaptivity in this dynamic pricing problem by proving that no\npricing policy can adaptively achieve this minimax optimal regret without\nknowledge of $\\beta$. Motivated by the impossibility result, we propose a\nself-similarity condition to enable adaptivity. Importantly, we show that the\nself-similarity condition does not compromise the problem's inherent complexity\nsince it preserves the regret lower bound\n$\\Omega(T^{\\frac{\\beta+1}{2\\beta+1}})$. Furthermore, we develop a\nsmoothness-adaptive dynamic pricing algorithm and theoretically prove that the\nalgorithm achieves this minimax optimal regret bound without the prior\nknowledge $\\beta$.\n","authors":["Zeqi Ye","Hansheng Jiang"],"pdf_url":"https://arxiv.org/pdf/2310.07558v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.12898v3","updated":"2023-10-11T14:59:49Z","published":"2023-06-22T14:07:23Z","title":"Machine-Learning-Assisted and Real-Time-Feedback-Controlled Growth of\n InAs/GaAs Quantum Dots","summary":" Self-assembled InAs/GaAs quantum dots (QDs) have properties highly valuable\nfor developing various optoelectronic devices such as QD lasers and single\nphoton sources. The applications strongly rely on the density and quality of\nthese dots, which has motivated studies of the growth process control to\nrealize high-quality epi-wafers and devices. Establishing the process\nparameters in molecular beam epitaxy (MBE) for a specific density of QDs is a\nmultidimensional optimization challenge, usually addressed through\ntime-consuming and iterative trial-and-error. Here, we report a real-time\nfeedback control method to realize the growth of QDs with arbitrary density,\nwhich is fully automated and intelligent. We developed a machine learning (ML)\nmodel named 3D ResNet 50 trained using reflection high-energy electron\ndiffraction (RHEED) videos as input instead of static images and providing\nreal-time feedback on surface morphologies for process control. 
As a result, we\ndemonstrated that ML from previous growth could predict the post-growth density\nof QDs, by successfully tuning the QD densities in near-real time from 1.5E10\ncm-2 down to 3.8E8 cm-2 or up to 1.4E11 cm-2. Compared to traditional methods,\nour approach, with in situ tuning capabilities and excellent reliability, can\ndramatically expedite the material optimization process and improve the\nreproducibility of MBE, constituting significant progress for thin film growth\ntechniques. The concepts and methodologies proven feasible in this work are\npromising for application to a variety of material growth processes, which will\nrevolutionize semiconductor manufacturing for optoelectronic and\nmicroelectronic industries.\n","authors":["Chao Shen","Wenkang Zhan","Kaiyao Xin","Manyang Li","Zhenyu Sun","Hui Cong","Chi Xu","Jian Tang","Zhaofeng Wu","Bo Xu","Zhongming Wei","Chunlai Xue","Chao Zhao","Zhanguo Wang"],"pdf_url":"https://arxiv.org/pdf/2306.12898v3.pdf","comment":"5 figures"},{"id":"http://arxiv.org/abs/2308.12067v2","updated":"2023-10-11T14:49:26Z","published":"2023-08-23T11:27:30Z","title":"InstructionGPT-4: A 200-Instruction Paradigm for Fine-Tuning MiniGPT-4","summary":" Multimodal large language models are typically trained in two stages: first\npre-training on image-text pairs, and then fine-tuning using supervised\nvision-language instruction data. Recent studies have shown that large language\nmodels can achieve satisfactory results even with a limited amount of\nhigh-quality instruction-following data. In this paper, we introduce\nInstructionGPT-4, which is fine-tuned on a small dataset comprising only 200\nexamples, amounting to approximately 6\% of the instruction-following data used\nin the alignment dataset for MiniGPT-4. To achieve this, we first propose\nseveral metrics to assess the quality of multimodal instruction data. Based on\nthese metrics, we present an effective and trainable data selector to\nautomatically identify and filter low-quality vision-language data. By\nemploying this method, InstructionGPT-4 outperforms the original MiniGPT-4 on\nvarious evaluations. Overall, our findings demonstrate that less but\nhigh-quality instruction tuning data is effective in enabling multimodal large\nlanguage models to generate better output. Our code is available at\nhttps://github.com/waltonfuture/InstructionGPT-4.\n","authors":["Lai Wei","Zihao Jiang","Weiran Huang","Lichao Sun"],"pdf_url":"https://arxiv.org/pdf/2308.12067v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07535v1","updated":"2023-10-11T14:39:51Z","published":"2023-10-11T14:39:51Z","title":"Improving Fairness-Accuracy tradeoff with few Test Samples under\n Covariate Shift","summary":" Covariate shift in the test data can significantly downgrade both the\naccuracy and the fairness performance of the model. Ensuring fairness across\ndifferent sensitive groups in such settings is of paramount importance due to\nsocietal implications like criminal justice. We operate under the unsupervised\nregime where only a small set of unlabeled test samples along with a labeled\ntraining set is available. Towards this problem, we make three contributions.\nFirst is a novel composite weighted entropy based objective for prediction\naccuracy which is optimized along with a representation matching loss for\nfairness.
We experimentally verify that optimizing with our loss formulation\noutperforms a number of state-of-the-art baselines in the pareto sense with\nrespect to the fairness-accuracy tradeoff on several standard datasets. Our\nsecond contribution is a new setting we term Asymmetric Covariate Shift that,\nto the best of our knowledge, has not been studied before. Asymmetric covariate\nshift occurs when distribution of covariates of one group shifts significantly\ncompared to the other groups and this happens when a dominant group is\nover-represented. While this setting is extremely challenging for current\nbaselines, We show that our proposed method significantly outperforms them. Our\nthird contribution is theoretical, where we show that our weighted entropy term\nalong with prediction loss on the training set approximates test loss under\ncovariate shift. Empirically and through formal sample complexity bounds, we\nshow that this approximation to the unseen test loss does not depend on\nimportance sampling variance which affects many other baselines.\n","authors":["Shreyas Havaldar","Jatin Chauhan","Karthikeyan Shanmugam","Jay Nandy","Aravindan Raghuveer"],"pdf_url":"https://arxiv.org/pdf/2310.07535v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07534v1","updated":"2023-10-11T14:39:12Z","published":"2023-10-11T14:39:12Z","title":"Human-Centered Evaluation of XAI Methods","summary":" In the ever-evolving field of Artificial Intelligence, a critical challenge\nhas been to decipher the decision-making processes within the so-called \"black\nboxes\" in deep learning. Over recent years, a plethora of methods have emerged,\ndedicated to explaining decisions across diverse tasks. Particularly in tasks\nlike image classification, these methods typically identify and emphasize the\npivotal pixels that most influence a classifier's prediction. Interestingly,\nthis approach mirrors human behavior: when asked to explain our rationale for\nclassifying an image, we often point to the most salient features or aspects.\nCapitalizing on this parallel, our research embarked on a user-centric study.\nWe sought to objectively measure the interpretability of three leading\nexplanation methods: (1) Prototypical Part Network, (2) Occlusion, and (3)\nLayer-wise Relevance Propagation. Intriguingly, our results highlight that\nwhile the regions spotlighted by these methods can vary widely, they all offer\nhumans a nearly equivalent depth of understanding. This enables users to\ndiscern and categorize images efficiently, reinforcing the value of these\nmethods in enhancing AI transparency.\n","authors":["Karam Dawoud","Wojciech Samek","Sebastian Lapuschkin","Sebastian Bosse"],"pdf_url":"https://arxiv.org/pdf/2310.07534v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.13149v3","updated":"2023-10-11T14:32:34Z","published":"2023-07-24T22:22:32Z","title":"Discovering interpretable elastoplasticity models via the neural\n polynomial method enabled symbolic regressions","summary":" Conventional neural network elastoplasticity models are often perceived as\nlacking interpretability. This paper introduces a two-step machine learning\napproach that returns mathematical models interpretable by human experts. In\nparticular, we introduce a surrogate model where yield surfaces are expressed\nin terms of a set of single-variable feature mappings obtained from supervised\nlearning. 
A postprocessing step is then used to re-interpret the set of\nsingle-variable neural network mapping functions into mathematical form through\nsymbolic regression. This divide-and-conquer approach provides several\nimportant advantages. First, it enables us to overcome the scaling issue of\nsymbolic regression algorithms. From a practical perspective, it enhances the\nportability of learned models for partial differential equation solvers written\nin different programming languages. Finally, it enables us to have a concrete\nunderstanding of the attributes of the materials, such as convexity and\nsymmetries of models, through automated derivations and reasoning. Numerical\nexamples have been provided, along with an open-source code to enable third\nparty validation.\n","authors":["Bahador Bahmani","Hyoung Suk Suh","WaiChing Sun"],"pdf_url":"https://arxiv.org/pdf/2307.13149v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07528v1","updated":"2023-10-11T14:29:11Z","published":"2023-10-11T14:29:11Z","title":"Provable Advantage of Parameterized Quantum Circuit in Function\n Approximation","summary":" Understanding the power of parameterized quantum circuits (PQCs) in\naccomplishing machine learning tasks is one of the most important questions in\nquantum machine learning. In this paper, we analyze the expressivity of PQCs\nthrough the lens of function approximation. Previously established universal\napproximation theorems for PQCs are mainly nonconstructive, leading us to the\nfollowing question: How large do the PQCs need to be to approximate the target\nfunction up to a given error? We exhibit explicit constructions of data\nre-uploading PQCs for approximating continuous and smooth functions and\nestablish quantitative approximation error bounds in terms of the width, the\ndepth and the number of trainable parameters of the PQCs. To achieve this, we\nutilize techniques from quantum signal processing and linear combinations of\nunitaries to construct PQCs that implement multivariate polynomials. We\nimplement global and local approximation techniques using Bernstein polynomials\nand local Taylor expansion and analyze their performances in the quantum\nsetting. We also compare our proposed PQCs to nearly optimal deep neural\nnetworks in approximating high-dimensional smooth functions, showing that the\nratio between model sizes of PQC and deep neural networks is exponentially\nsmall with respect to the input dimension. This suggests a potentially novel\navenue for showcasing quantum advantages in quantum machine learning.\n","authors":["Zhan Yu","Qiuhao Chen","Yuling Jiao","Yinan Li","Xiliang Lu","Xin Wang","Jerry Zhijian Yang"],"pdf_url":"https://arxiv.org/pdf/2310.07528v1.pdf","comment":"31pages, 3 figures"},{"id":"http://arxiv.org/abs/2304.01950v2","updated":"2023-10-11T14:21:29Z","published":"2023-04-01T09:16:40Z","title":"MP-FedCL: Multiprototype Federated Contrastive Learning for Edge\n Intelligence","summary":" Federated learning-assisted edge intelligence enables privacy protection in\nmodern intelligent services. However, not independent and identically\ndistributed (non-IID) distribution among edge clients can impair the local\nmodel performance. The existing single prototype-based strategy represents a\nclass by using the mean of the feature space. 
However, feature spaces are\nusually not clustered, and a single prototype may not represent a class well.\nMotivated by this, this paper proposes a multi-prototype federated contrastive\nlearning approach (MP-FedCL) which demonstrates the effectiveness of using a\nmulti-prototype strategy over a single-prototype under non-IID settings,\nincluding both label and feature skewness. Specifically, a multi-prototype\ncomputation strategy based on \\textit{k-means} is first proposed to capture\ndifferent embedding representations for each class space, using multiple\nprototypes ($k$ centroids) to represent a class in the embedding space. In each\nglobal round, the computed multiple prototypes and their respective model\nparameters are sent to the edge server for aggregation into a global prototype\npool, which is then sent back to all clients to guide their local training.\nFinally, local training for each client minimizes their own supervised learning\ntasks and learns from shared prototypes in the global prototype pool through\nsupervised contrastive learning, which encourages them to learn knowledge\nrelated to their own class from others and reduces the absorption of unrelated\nknowledge in each global iteration. Experimental results on MNIST, Digit-5,\nOffice-10, and DomainNet show that our method outperforms multiple baselines,\nwith an average test accuracy improvement of about 4.6\\% and 10.4\\% under\nfeature and label non-IID distributions, respectively.\n","authors":["Yu Qiao","Md. Shirajum Munir","Apurba Adhikary","Huy Q. Le","Avi Deb Raha","Chaoning Zhang","Choong Seon Hong"],"pdf_url":"https://arxiv.org/pdf/2304.01950v2.pdf","comment":"Accepted by IEEE Internet of Things"},{"id":"http://arxiv.org/abs/2111.08117v3","updated":"2023-10-11T14:20:40Z","published":"2021-11-15T22:33:52Z","title":"Neural networks with linear threshold activations: structure and\n algorithms","summary":" In this article we present new results on neural networks with linear\nthreshold activation functions. We precisely characterize the class of\nfunctions that are representable by such neural networks and show that 2 hidden\nlayers are necessary and sufficient to represent any function representable in\nthe class. This is a surprising result in the light of recent exact\nrepresentability investigations for neural networks using other popular\nactivation functions like rectified linear units (ReLU). We also give precise\nbounds on the sizes of the neural networks required to represent any function\nin the class. Finally, we design an algorithm to solve the empirical risk\nminimization (ERM) problem to global optimality for these neural networks with\na fixed architecture. The algorithm's running time is polynomial in the size of\nthe data sample, if the input dimension and the size of the network\narchitecture are considered fixed constants. The algorithm is unique in the\nsense that it works for any architecture with any number of layers, whereas\nprevious polynomial time globally optimal algorithms work only for very\nrestricted classes of architectures. Using these insights, we propose a new\nclass of neural networks that we call shortcut linear threshold networks. To\nthe best of our knowledge, this way of designing neural networks has not been\nexplored before in the literature. 
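Since the abstract above concerns networks built from linear threshold activations, a minimal sketch of such a unit may help fix ideas: the unit outputs 1 exactly when its affine pre-activation is positive. The layer sizes and the random example input are arbitrary choices for illustration, not anything prescribed by the paper.

```python
# Minimal sketch of a linear threshold unit/layer: output 1[Wx + b > 0]
# elementwise. Two hidden layers are composed here purely as an example.
import numpy as np

def linear_threshold_layer(x: np.ndarray, W: np.ndarray, b: np.ndarray) -> np.ndarray:
    """Apply a layer of linear threshold units: indicator of Wx + b > 0."""
    return (W @ x + b > 0).astype(np.float64)

rng = np.random.default_rng(0)
x = rng.normal(size=3)
W1, b1 = rng.normal(size=(4, 3)), rng.normal(size=4)   # first hidden layer
W2, b2 = rng.normal(size=(2, 4)), rng.normal(size=2)   # second hidden layer
h = linear_threshold_layer(linear_threshold_layer(x, W1, b1), W2, b2)
print(h)   # binary outputs of a two-hidden-layer threshold network
```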
We show that these neural networks have\nseveral desirable theoretical properties.\n","authors":["Sammy Khalife","Hongyu Cheng","Amitabh Basu"],"pdf_url":"https://arxiv.org/pdf/2111.08117v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07518v1","updated":"2023-10-11T14:16:04Z","published":"2023-10-11T14:16:04Z","title":"Exploiting Causal Graph Priors with Posterior Sampling for Reinforcement\n Learning","summary":" Posterior sampling allows the exploitation of prior knowledge of the\nenvironment's transition dynamics to improve the sample efficiency of\nreinforcement learning. The prior is typically specified as a class of\nparametric distributions, a task that can be cumbersome in practice, often\nresulting in the choice of uninformative priors. In this work, we propose a\nnovel posterior sampling approach in which the prior is given as a (partial)\ncausal graph over the environment's variables. The latter is often more natural\nto design, such as listing known causal dependencies between biometric features\nin a medical treatment study. Specifically, we propose a hierarchical Bayesian\nprocedure, called C-PSRL, simultaneously learning the full causal graph at the\nhigher level and the parameters of the resulting factored dynamics at the lower\nlevel. For this procedure, we provide an analysis of its Bayesian regret, which\nexplicitly connects the regret rate with the degree of prior knowledge. Our\nnumerical evaluation conducted in illustrative domains confirms that C-PSRL\nstrongly improves the efficiency of posterior sampling with an uninformative\nprior while performing close to posterior sampling with the full causal graph.\n","authors":["Mirco Mutti","Riccardo De Santi","Marcello Restelli","Alexander Marx","Giorgia Ramponi"],"pdf_url":"https://arxiv.org/pdf/2310.07518v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07511v1","updated":"2023-10-11T14:07:05Z","published":"2023-10-11T14:07:05Z","title":"A Unified Remote Sensing Anomaly Detector Across Modalities and Scenes\n via Deviation Relationship Learning","summary":" Remote sensing anomaly detector can find the objects deviating from the\nbackground as potential targets. Given the diversity in earth anomaly types, a\nunified anomaly detector across modalities and scenes should be cost-effective\nand flexible to new earth observation sources and anomaly types. However, the\ncurrent anomaly detectors are limited to a single modality and single scene,\nsince they aim to learn the varying background distribution. Motivated by the\nuniversal anomaly deviation pattern, in that anomalies exhibit deviations from\ntheir local context, we exploit this characteristic to build a unified anomaly\ndetector. Firstly, we reformulate the anomaly detection task as an undirected\nbilayer graph based on the deviation relationship, where the anomaly score is\nmodeled as the conditional probability, given the pattern of the background and\nnormal objects. The learning objective is then expressed as a conditional\nprobability ranking problem. Furthermore, we design an instantiation of the\nreformulation in the data, architecture, and optimization aspects. Simulated\nspectral and spatial anomalies drive the instantiated architecture. The model\nis optimized directly for the conditional probability ranking. 
The proposed\nmodel was validated in five modalities including the hyperspectral, visible\nlight, synthetic aperture radar (SAR), infrared and low light to show its\nunified detection ability.\n","authors":["Jingtao Li","Xinyu Wang","Hengwei Zhao","Liangpei Zhang","Yanfei Zhong"],"pdf_url":"https://arxiv.org/pdf/2310.07511v1.pdf","comment":"Journal paper"},{"id":"http://arxiv.org/abs/2310.07506v1","updated":"2023-10-11T14:02:11Z","published":"2023-10-11T14:02:11Z","title":"Leveraging Hierarchical Feature Sharing for Efficient Dataset\n Condensation","summary":" Given a real-world dataset, data condensation (DC) aims to synthesize a\nsignificantly smaller dataset that captures the knowledge of this dataset for\nmodel training with high performance. Recent works propose to enhance DC with\ndata parameterization, which condenses data into parameterized data containers\nrather than pixel space. The intuition behind data parameterization is to\nencode shared features of images to avoid additional storage costs. In this\npaper, we recognize that images share common features in a hierarchical way due\nto the inherent hierarchical structure of the classification system, which is\noverlooked by current data parameterization methods. To better align DC with\nthis hierarchical nature and encourage more efficient information sharing\ninside data containers, we propose a novel data parameterization architecture,\nHierarchical Memory Network (HMN). HMN stores condensed data in a three-tier\nstructure, representing the dataset-level, class-level, and instance-level\nfeatures. Another helpful property of the hierarchical architecture is that HMN\nnaturally ensures good independence among images despite achieving information\nsharing. This enables instance-level pruning for HMN to reduce redundant\ninformation, thereby further minimizing redundancy and enhancing performance.\nWe evaluate HMN on four public datasets (SVHN, CIFAR10, CIFAR100, and\nTiny-ImageNet) and compare HMN with eight DC baselines. The evaluation results\nshow that our proposed method outperforms all baselines, even when trained with\na batch-based loss consuming less GPU memory.\n","authors":["Haizhong Zheng","Jiachen Sun","Shutong Wu","Bhavya Kailkhura","Zhuoqing Mao","Chaowei Xiao","Atul Prakash"],"pdf_url":"https://arxiv.org/pdf/2310.07506v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07497v1","updated":"2023-10-11T13:50:28Z","published":"2023-10-11T13:50:28Z","title":"Sample-Driven Federated Learning for Energy-Efficient and Real-Time IoT\n Sensing","summary":" In the domain of Federated Learning (FL) systems, recent cutting-edge methods\nheavily rely on ideal conditions convergence analysis. Specifically, these\napproaches assume that the training datasets on IoT devices possess similar\nattributes to the global data distribution. However, this approach fails to\ncapture the full spectrum of data characteristics in real-time sensing FL\nsystems. In order to overcome this limitation, we suggest a new approach system\nspecifically designed for IoT networks with real-time sensing capabilities. Our\napproach takes into account the generalization gap due to the user's data\nsampling process. By effectively controlling this sampling process, we can\nmitigate the overfitting issue and improve overall accuracy. In particular, We\nfirst formulate an optimization problem that harnesses the sampling process to\nconcurrently reduce overfitting while maximizing accuracy. 
In pursuit of this\nobjective, our surrogate optimization problem is adept at handling energy\nefficiency while optimizing the accuracy with high generalization. To solve the\noptimization problem with high complexity, we introduce an online reinforcement\nlearning algorithm, named Sample-driven Control for Federated Learning (SCFL)\nbuilt on the Soft Actor-Critic (A2C) framework. This enables the agent to\ndynamically adapt and find the global optima even in changing environments. By\nleveraging the capabilities of SCFL, our system offers a promising solution for\nresource allocation in FL systems with real-time sensing capabilities.\n","authors":["Minh Ngoc Luu","Minh-Duong Nguyen","Ebrahim Bedeer","Van Duc Nguyen","Dinh Thai Hoang","Diep N. Nguyen","Quoc-Viet Pham"],"pdf_url":"https://arxiv.org/pdf/2310.07497v1.pdf","comment":"17 pages, 5 figures"},{"id":"http://arxiv.org/abs/2310.07491v1","updated":"2023-10-11T13:39:04Z","published":"2023-10-11T13:39:04Z","title":"Model-based Clustering of Individuals' Ecological Momentary Assessment\n Time-series Data for Improving Forecasting Performance","summary":" Through Ecological Momentary Assessment (EMA) studies, a number of\ntime-series data is collected across multiple individuals, continuously\nmonitoring various items of emotional behavior. Such complex data is commonly\nanalyzed at an individual level, using personalized models. However, it is\nbelieved that additional information of similar individuals is likely to\nenhance these models, leading to a better description of individuals. Thus,\nclustering is investigated with an aim to group together the most similar\nindividuals, and subsequently use this information in group-based models in\norder to improve individuals' predictive performance. More specifically, two\nmodel-based clustering approaches are examined, where the first is using\nmodel-extracted parameters of personalized models, whereas the second is\noptimized on the model-based forecasting performance. Both methods are then\nanalyzed using intrinsic clustering evaluation measures (e.g. Silhouette\ncoefficients) as well as the performance of a downstream forecasting scheme,\nwhere each forecasting group-model is devoted to describing all individuals\nbelonging to one cluster. Among these, clustering based on performance shows\nthe best results, in terms of all examined evaluation measures. As another\nlevel of evaluation, those group-models' performance is compared to three\nbaseline scenarios, the personalized, the all-in-one group and the random\ngroup-based concept. According to this comparison, the superiority of\nclustering-based methods is again confirmed, indicating that the utilization of\ngroup-based information could effectively enhance the overall performance of\nall individuals' data.\n","authors":["Mandani Ntekouli","Gerasimos Spanakis","Lourens Waldorp","Anne Roefs"],"pdf_url":"https://arxiv.org/pdf/2310.07491v1.pdf","comment":"17 pages, 7 figures, BNAIC/BeNeLearn 2023 (Joint International\n Scientific Conferences on AI and Machine Learning)"},{"id":"http://arxiv.org/abs/2310.07488v1","updated":"2023-10-11T13:35:05Z","published":"2023-10-11T13:35:05Z","title":"KwaiYiiMath: Technical Report","summary":" Recent advancements in large language models (LLMs) have demonstrated\nremarkable abilities in handling a variety of natural language processing (NLP)\ndownstream tasks, even on mathematical tasks requiring multi-step reasoning.
In\nthis report, we introduce KwaiYiiMath, which enhances the mathematical\nreasoning abilities of KwaiYiiBase1, by applying Supervised Fine-Tuning (SFT)\nand Reinforcement Learning from Human Feedback (RLHF) on both English\nand Chinese mathematical tasks. Meanwhile, we also constructed a small-scale\nChinese primary school mathematics test set (named KMath), consisting of 188\nexamples to evaluate the correctness of the problem-solving process generated\nby the models. Empirical studies demonstrate that KwaiYiiMath can achieve\nstate-of-the-art (SOTA) performance on GSM8k, CMath, and KMath compared with\nsimilarly sized models.\n","authors":["Jiayi Fu","Lei Lin","Xiaoyang Gao","Pengli Liu","Zhengzong Chen","Zhirui Yang","Shengnan Zhang","Xue Zheng","Yan Li","Yuliang Liu","Xucheng Ye","Yiqiao Liao","Chao Liao","Bin Chen","Chengru Song","Junchen Wan","Zijia Lin","Fuzheng Zhang","Zhongyuan Wang","Di Zhang","Kun Gai"],"pdf_url":"https://arxiv.org/pdf/2310.07488v1.pdf","comment":"technical report"},{"id":"http://arxiv.org/abs/2310.07485v1","updated":"2023-10-11T13:32:04Z","published":"2023-10-11T13:32:04Z","title":"Nonlinear embeddings for conserving Hamiltonians and other quantities\n with Neural Galerkin schemes","summary":" This work focuses on the conservation of quantities such as Hamiltonians,\nmass, and momentum when solution fields of partial differential equations are\napproximated with nonlinear parametrizations such as deep networks. The\nproposed approach builds on Neural Galerkin schemes that are based on the\nDirac--Frenkel variational principle to train nonlinear parametrizations\nsequentially in time. We first show that only adding constraints that aim to\nconserve quantities in continuous time can be insufficient because the\nnonlinear dependence on the parameters implies that even quantities that are\nlinear in the solution fields become nonlinear in the parameters and thus are\nchallenging to discretize in time. Instead, we propose Neural Galerkin schemes\nthat compute at each time step an explicit embedding onto the manifold of\nnonlinearly parametrized solution fields to guarantee conservation of\nquantities. The embeddings can be combined with standard explicit and implicit\ntime integration schemes. Numerical experiments demonstrate that the proposed\napproach conserves quantities up to machine precision.\n","authors":["Paul Schwerdtner","Philipp Schulze","Jules Berman","Benjamin Peherstorfer"],"pdf_url":"https://arxiv.org/pdf/2310.07485v1.pdf","comment":"29 pages, 8 figures"},{"id":"http://arxiv.org/abs/2310.05869v2","updated":"2023-10-11T13:25:13Z","published":"2023-10-09T17:05:25Z","title":"HyperAttention: Long-context Attention in Near-Linear Time","summary":" We present an approximate attention mechanism named HyperAttention to address\nthe computational challenges posed by the growing complexity of long contexts\nused in Large Language Models (LLMs). Recent work suggests that in the\nworst-case scenario, quadratic time is necessary unless the entries of the\nattention matrix are bounded or the matrix has low stable rank. We introduce\ntwo parameters which measure: (1) the max column norm in the normalized\nattention matrix, and (2) the ratio of row norms in the unnormalized attention\nmatrix after detecting and removing large entries. We use these fine-grained\nparameters to capture the hardness of the problem.
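As a rough illustration of the two quantities just described (not the authors' exact estimator), one could compute them from a small dense score matrix as follows. The use of column 2-norms and the threshold used to decide which entries count as "large" are assumptions of this sketch.

```python
# Rough illustration of the two hardness parameters described above for a
# dense attention matrix: (1) the max column norm of the row-normalized
# (softmax) attention matrix, and (2) the ratio of row norms of the
# unnormalized scores after masking out large entries. Column 2-norms and the
# threshold rule are choices made for this sketch only.
import numpy as np

def hardness_parameters(scores: np.ndarray, large_entry_threshold: float):
    # (1) row-wise softmax, then the largest column 2-norm
    exp = np.exp(scores - scores.max(axis=1, keepdims=True))
    attn = exp / exp.sum(axis=1, keepdims=True)
    max_col_norm = np.linalg.norm(attn, axis=0).max()

    # (2) zero out "large" entries of the unnormalized scores, then compare
    # the largest and smallest remaining row norms
    masked = np.where(np.abs(scores) > large_entry_threshold, 0.0, scores)
    row_norms = np.linalg.norm(masked, axis=1)
    row_norm_ratio = row_norms.max() / max(row_norms.min(), 1e-12)
    return max_col_norm, row_norm_ratio

rng = np.random.default_rng(0)
S = rng.normal(size=(8, 8))          # stand-in for query-key scores
print(hardness_parameters(S, large_entry_threshold=2.0))
```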
Despite previous lower\nbounds, we are able to achieve a linear time sampling algorithm even when the\nmatrix has unbounded entries or a large stable rank, provided the above\nparameters are small. HyperAttention features a modular design that easily\naccommodates integration of other fast low-level implementations, particularly\nFlashAttention. Empirically, employing Locality Sensitive Hashing (LSH) to\nidentify large entries, HyperAttention outperforms existing methods, giving\nsignificant speed improvements compared to state-of-the-art solutions like\nFlashAttention. We validate the empirical performance of HyperAttention on a\nvariety of different long-context length datasets. For example, HyperAttention\nmakes the inference time of ChatGLM2 50\\% faster on 32k context length while\nperplexity increases from 5.6 to 6.3. On larger context length, e.g., 131k,\nwith causal masking, HyperAttention offers 5-fold speedup on a single attention\nlayer.\n","authors":["Insu Han","Rajesh Jayaram","Amin Karbasi","Vahab Mirrokni","David P. Woodruff","Amir Zandieh"],"pdf_url":"https://arxiv.org/pdf/2310.05869v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.07068v4","updated":"2023-10-11T13:09:46Z","published":"2022-07-14T17:16:45Z","title":"Bias Mitigation for Machine Learning Classifiers: A Comprehensive Survey","summary":" This paper provides a comprehensive survey of bias mitigation methods for\nachieving fairness in Machine Learning (ML) models. We collect a total of 341\npublications concerning bias mitigation for ML classifiers. These methods can\nbe distinguished based on their intervention procedure (i.e., pre-processing,\nin-processing, post-processing) and the technique they apply. We investigate\nhow existing bias mitigation methods are evaluated in the literature. In\nparticular, we consider datasets, metrics and benchmarking. Based on the\ngathered insights (e.g., What is the most popular fairness metric? How many\ndatasets are used for evaluating bias mitigation methods?), we hope to support\npractitioners in making informed choices when developing and evaluating new\nbias mitigation methods.\n","authors":["Max Hort","Zhenpeng Chen","Jie M. Zhang","Mark Harman","Federica Sarro"],"pdf_url":"https://arxiv.org/pdf/2207.07068v4.pdf","comment":"52 pages, 7 figures"},{"id":"http://arxiv.org/abs/2310.07464v1","updated":"2023-10-11T13:05:33Z","published":"2023-10-11T13:05:33Z","title":"Deep Learning Predicts Biomarker Status and Discovers Related\n Histomorphology Characteristics for Low-Grade Glioma","summary":" Biomarker detection is an indispensable part in the diagnosis and treatment\nof low-grade glioma (LGG). However, current LGG biomarker detection methods\nrely on expensive and complex molecular genetic testing, for which\nprofessionals are required to analyze the results, and intra-rater variability\nis often reported. To overcome these challenges, we propose an interpretable\ndeep learning pipeline, a Multi-Biomarker Histomorphology Discoverer\n(Multi-Beholder) model based on the multiple instance learning (MIL) framework,\nto predict the status of five biomarkers in LGG using only hematoxylin and\neosin-stained whole slide images and slide-level biomarker status labels.\nSpecifically, by incorporating the one-class classification into the MIL\nframework, accurate instance pseudo-labeling is realized for instance-level\nsupervision, which greatly complements the slide-level labels and improves the\nbiomarker prediction performance. 
Multi-Beholder demonstrates superior\nprediction performance and generalizability for five LGG biomarkers\n(AUROC=0.6469-0.9735) in two cohorts (n=607) with diverse races and scanning\nprotocols. Moreover, the excellent interpretability of Multi-Beholder allows\nfor discovering the quantitative and qualitative correlations between biomarker\nstatus and histomorphology characteristics. Our pipeline not only provides a\nnovel approach for biomarker prediction, enhancing the applicability of\nmolecular treatments for LGG patients, but also facilitates the discovery of new\nmechanisms in molecular functionality and LGG progression.\n","authors":["Zijie Fang","Yihan Liu","Yifeng Wang","Xiangyang Zhang","Yang Chen","Changjing Cai","Yiyang Lin","Ying Han","Zhi Wang","Shan Zeng","Hong Shen","Jun Tan","Yongbing Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.07464v1.pdf","comment":"47 pages, 6 figures"},{"id":"http://arxiv.org/abs/2310.07463v1","updated":"2023-10-11T13:05:28Z","published":"2023-10-11T13:05:28Z","title":"Uncovering ECG Changes during Healthy Aging using Explainable AI","summary":" Cardiovascular diseases remain the leading global cause of mortality. This\nnecessitates a profound understanding of heart aging processes to diagnose\nconstraints in cardiovascular fitness. Traditionally, most of such insights\nhave been drawn from the analysis of electrocardiogram (ECG) feature changes of\nindividuals as they age. However, these features, while informative, may\npotentially obscure underlying data relationships. In this paper, we employ a\ndeep-learning model and a tree-based model to analyze ECG data from a robust\ndataset of healthy individuals across varying ages in both raw signals and ECG\nfeature format. Explainable AI techniques are then used to identify which ECG\nfeatures or raw signal characteristics are most discriminative for\ndistinguishing between age groups. Our analysis with tree-based classifiers\nreveals age-related declines in inferred breathing rates and identifies notably\nhigh SDANN values as indicative of elderly individuals, distinguishing them\nfrom younger adults. Furthermore, the deep-learning model underscores the\npivotal role of the P-wave in age predictions across all age groups, suggesting\npotential changes in the distribution of different P-wave types with age. These\nfindings shed new light on age-related ECG changes, offering insights that\ntranscend traditional feature-based approaches.\n","authors":["Gabriel Ott","Yannik Schaubelt","Juan Miguel Lopez Alcaraz","Wilhelm Haverkamp","Nils Strodthoff"],"pdf_url":"https://arxiv.org/pdf/2310.07463v1.pdf","comment":"10 pages, 8 figures, code available under\n https://github.com/AI4HealthUOL/ECG-aging"},{"id":"http://arxiv.org/abs/2310.07461v1","updated":"2023-10-11T13:05:03Z","published":"2023-10-11T13:05:03Z","title":"Efficient machine-learning surrogates for large-scale geological carbon\n and energy storage","summary":" Geological carbon and energy storage are pivotal for achieving net-zero\ncarbon emissions and addressing climate change. However, they face\nuncertainties due to geological factors and operational limitations, resulting\nin possibilities of induced seismic events or groundwater contamination. To\novercome these challenges, we propose a specialized machine-learning (ML) model\nto manage extensive reservoir models efficiently.\n While ML approaches hold promise for geological carbon storage, the\nsubstantial computational resources required for large-scale analysis are the\nobstacle.
We've developed a method to reduce the training cost for deep neural\noperator models, using domain decomposition and a topology embedder to link\nspatio-temporal points. This approach allows accurate predictions within the\nmodel's domain, even for untrained data, enhancing ML efficiency for\nlarge-scale geological storage applications.\n","authors":["Teeratorn Kadeethum","Stephen J. Verzi","Hongkyu Yoon"],"pdf_url":"https://arxiv.org/pdf/2310.07461v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07446v1","updated":"2023-10-11T12:48:45Z","published":"2023-10-11T12:48:45Z","title":"ProbTS: A Unified Toolkit to Probe Deep Time-series Forecasting","summary":" Time-series forecasting serves as a linchpin in a myriad of applications,\nspanning various domains. With the growth of deep learning, this arena has\nbifurcated into two salient branches: one focuses on crafting specific neural\narchitectures tailored for time series, and the other harnesses advanced deep\ngenerative models for probabilistic forecasting. While both branches have made\nsignificant progress, their differences across data scenarios, methodological\nfocuses, and decoding schemes pose profound, yet unexplored, research\nquestions. To bridge this knowledge chasm, we introduce ProbTS, a pioneering\ntoolkit developed to synergize and compare these two distinct branches. Endowed\nwith a unified data module, a modularized model module, and a comprehensive\nevaluator module, ProbTS allows us to revisit and benchmark leading methods\nfrom both branches. The scrutiny with ProbTS highlights their distinct\ncharacteristics, relative strengths and weaknesses, and areas that need further\nexploration. Our analyses point to new avenues for research, aiming for more\neffective time-series forecasting.\n","authors":["Jiawen Zhang","Xumeng Wen","Shun Zheng","Jia Li","Jiang Bian"],"pdf_url":"https://arxiv.org/pdf/2310.07446v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2310.07437v1","updated":"2023-10-11T12:40:07Z","published":"2023-10-11T12:40:07Z","title":"A Branched Deep Convolutional Network for Forecasting the Occurrence of\n Hazes in Paris using Meteorological Maps with Different Characteristic\n Spatial Scales","summary":" A deep learning platform has been developed to forecast the occurrence of the\nlow visibility events or hazes. It is trained by using multi-decadal daily\nregional maps of various meteorological and hydrological variables as input\nfeatures and surface visibility observations as the targets. To better preserve\nthe characteristic spatial information of different input features for\ntraining, two branched architectures have recently been developed for the case\nof Paris hazes. These new architectures have improved the performance of the\nnetwork, producing reasonable scores in both validation and a blind forecasting\nevaluation using the data of 2021 and 2022 that have not been used in the\ntraining and validation.\n","authors":["Chien Wang"],"pdf_url":"https://arxiv.org/pdf/2310.07437v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07435v1","updated":"2023-10-11T12:36:42Z","published":"2023-10-11T12:36:42Z","title":"Generalized Mixture Model for Extreme Events Forecasting in Time Series\n Data","summary":" Time Series Forecasting (TSF) is a widely researched topic with broad\napplications in weather forecasting, traffic control, and stock price\nprediction. 
Extreme values in time series often significantly impact human and\nnatural systems, but predicting them is challenging due to their rare\noccurrence. Statistical methods based on Extreme Value Theory (EVT) provide a\nsystematic approach to modeling the distribution of extremes, particularly the\nGeneralized Pareto (GP) distribution for modeling the distribution of\nexceedances beyond a threshold. To overcome the subpar performance of deep\nlearning in dealing with heavy-tailed data, we propose a novel framework to\nenhance the focus on extreme events. Specifically, we propose a Deep Extreme\nMixture Model with Autoencoder (DEMMA) for time series prediction. The model\ncomprises two main modules: 1) a generalized mixture distribution based on the\nHurdle model and a reparameterized form of the GP distribution independent of the\nextreme threshold, and 2) an Autoencoder-based LSTM feature extractor and a\nquantile prediction module with a temporal attention mechanism. We demonstrate\nthe effectiveness of our approach on multiple real-world rainfall datasets.\n","authors":["Jincheng Wang","Yue Gao"],"pdf_url":"https://arxiv.org/pdf/2310.07435v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07434v1","updated":"2023-10-11T12:36:38Z","published":"2023-10-11T12:36:38Z","title":"HealthWalk: Promoting Health and Mobility through Sensor-Based Rollator\n Walker Assistance","summary":" Rollator walkers allow people with physical limitations to increase their\nmobility and give them the confidence and independence to participate in\nsociety for longer. However, rollator walker users often have poor posture,\nleading to further health problems and, in the worst case, falls. Integrating\nsensors into rollator walker designs can help to address this problem and\nresults in a platform that allows several other interesting use cases. This\npaper briefly overviews existing systems and the current research directions\nand challenges in this field. We also present our early HealthWalk rollator\nwalker prototype for data collection with older people, patients with rheumatism, multiple\nsclerosis, or Parkinson's disease, and individuals with visual impairments.\n","authors":["Ivanna Kramer","Kevin Weirauch","Sabine Bauer","Mark Oliver Mints","Peer Neubert"],"pdf_url":"https://arxiv.org/pdf/2310.07434v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07433v1","updated":"2023-10-11T12:34:39Z","published":"2023-10-11T12:34:39Z","title":"Imitation Learning from Observation with Automatic Discount Scheduling","summary":" Humans often acquire new skills through observation and imitation. For\nrobotic agents, learning from the plethora of unlabeled video demonstration\ndata available on the Internet necessitates imitating the expert without access\nto its actions, presenting a challenge known as Imitation Learning from\nObservations (ILfO). A common approach to tackle ILfO problems is to convert\nthem into inverse reinforcement learning problems, utilizing a proxy reward\ncomputed from the agent's and the expert's observations. Nonetheless, we\nidentify that tasks characterized by a progress dependency property pose\nsignificant challenges for such approaches; in these tasks, the agent needs to\ninitially learn the expert's preceding behaviors before mastering the\nsubsequent ones. Our investigation reveals that the main cause is that the\nreward signals assigned to later steps hinder the learning of initial\nbehaviors. 
To address this challenge, we present a novel ILfO framework that\nenables the agent to master earlier behaviors before advancing to later ones.\nWe introduce an Automatic Discount Scheduling (ADS) mechanism that adaptively\nalters the discount factor in reinforcement learning during the training phase,\nprioritizing earlier rewards initially and gradually engaging later rewards\nonly when the earlier behaviors have been mastered. Our experiments, conducted\non nine Meta-World tasks, demonstrate that our method significantly outperforms\nstate-of-the-art methods across all tasks, including those that are unsolvable\nby them.\n","authors":["Yuyang Liu","Weijun Dong","Yingdong Hu","Chuan Wen","Zhao-Heng Yin","Chongjie Zhang","Yang Gao"],"pdf_url":"https://arxiv.org/pdf/2310.07433v1.pdf","comment":"Submitted to ICLR 2024"},{"id":"http://arxiv.org/abs/2310.07430v1","updated":"2023-10-11T12:32:13Z","published":"2023-10-11T12:32:13Z","title":"Non-backtracking Graph Neural Networks","summary":" The celebrated message-passing updates for graph neural networks allow the\nrepresentation of large-scale graphs with local and computationally tractable\nupdates. However, the local updates suffer from backtracking, i.e., a message\nflows through the same edge twice and revisits the previously visited node.\nSince the number of message flows increases exponentially with the number of\nupdates, the redundancy in local updates prevents the graph neural network from\naccurately recognizing a particular message flow for downstream tasks. In this\nwork, we propose to resolve such a redundancy via the non-backtracking graph\nneural network (NBA-GNN) that updates a message without incorporating the\nmessage from the previously visited node. We further investigate how NBA-GNN\nalleviates the over-squashing of GNNs, and establish a connection between\nNBA-GNN and the impressive performance of non-backtracking updates for\nstochastic block model recovery. We empirically verify the effectiveness of our\nNBA-GNN on long-range graph benchmark and transductive node classification\nproblems.\n","authors":["Seonghyun Park","Narae Ryu","Gahee Kim","Dongyeop Woo","Se-Young Yun","Sungsoo Ahn"],"pdf_url":"https://arxiv.org/pdf/2310.07430v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07427v1","updated":"2023-10-11T12:28:52Z","published":"2023-10-11T12:28:52Z","title":"Quantum-Enhanced Forecasting: Leveraging Quantum Gramian Angular Field\n and CNNs for Stock Return Predictions","summary":" We propose a time series forecasting method named Quantum Gramian Angular\nField (QGAF). This approach merges the advantages of quantum computing\ntechnology with deep learning, aiming to enhance the precision of time series\nclassification and forecasting. We successfully transformed stock return time\nseries data into two-dimensional images suitable for Convolutional Neural\nNetwork (CNN) training by designing specific quantum circuits. Distinct from\nthe classical Gramian Angular Field (GAF) approach, QGAF's uniqueness lies in\neliminating the need for data normalization and inverse cosine calculations,\nsimplifying the transformation process from time series data to two-dimensional\nimages. To validate the effectiveness of this method, we conducted experiments\non datasets from three major stock markets: the China A-share market, the Hong\nKong stock market, and the US stock market. 
Experimental results revealed that\ncompared to the classical GAF method, the QGAF approach significantly improved\ntime series prediction accuracy, reducing prediction errors by an average of\n25\\% for Mean Absolute Error (MAE) and 48\\% for Mean Squared Error (MSE). This\nresearch confirms the potential and promising prospects of integrating quantum\ncomputing with deep learning techniques in financial time series forecasting.\n","authors":["Zhengmeng Xu","Hai Lin"],"pdf_url":"https://arxiv.org/pdf/2310.07427v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.13078v2","updated":"2023-10-11T12:21:30Z","published":"2023-09-21T02:46:20Z","title":"LPML: LLM-Prompting Markup Language for Mathematical Reasoning","summary":" In utilizing large language models (LLMs) for mathematical reasoning,\naddressing the errors in the reasoning and calculation present in the generated\ntext by LLMs is a crucial challenge. In this paper, we propose a novel\nframework that integrates the Chain-of-Thought (CoT) method with an external\ntool (Python REPL). We discovered that by prompting LLMs to generate structured\ntext in XML-like markup language, we could seamlessly integrate CoT and the\nexternal tool and control the undesired behaviors of LLMs. With our approach,\nLLMs can utilize Python computation to rectify errors within CoT. We applied\nour method to ChatGPT (GPT-3.5) to solve challenging mathematical problems and\ndemonstrated that combining CoT and Python REPL through the markup language\nenhances the reasoning capability of LLMs. Our approach enables LLMs to write\nthe markup language and perform advanced mathematical reasoning using only\nzero-shot prompting.\n","authors":["Ryutaro Yamauchi","Sho Sonoda","Akiyoshi Sannai","Wataru Kumagai"],"pdf_url":"https://arxiv.org/pdf/2309.13078v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.09615v4","updated":"2023-10-11T12:09:35Z","published":"2022-05-19T15:13:00Z","title":"EXACT: How to Train Your Accuracy","summary":" Classification tasks are usually evaluated in terms of accuracy. However,\naccuracy is discontinuous and cannot be directly optimized using gradient\nascent. Popular methods minimize cross-entropy, hinge loss, or other surrogate\nlosses, which can lead to suboptimal results. In this paper, we propose a new\noptimization framework by introducing stochasticity to a model's output and\noptimizing expected accuracy, i.e. accuracy of the stochastic model. Extensive\nexperiments on linear models and deep image classification show that the\nproposed optimization method is a powerful alternative to widely used\nclassification losses.\n","authors":["Ivan Karpukhin","Stanislav Dereka","Sergey Kolesnikov"],"pdf_url":"https://arxiv.org/pdf/2205.09615v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07418v1","updated":"2023-10-11T12:05:34Z","published":"2023-10-11T12:05:34Z","title":"Revisiting Plasticity in Visual Reinforcement Learning: Data, Modules\n and Training Stages","summary":" Plasticity, the ability of a neural network to evolve with new data, is\ncrucial for high-performance and sample-efficient visual reinforcement learning\n(VRL). Although methods like resetting and regularization can potentially\nmitigate plasticity loss, the influences of various components within the VRL\nframework on the agent's plasticity are still poorly understood. 
In this work,\nwe conduct a systematic empirical exploration focusing on three primary\nunderexplored facets and derive the following insightful conclusions: (1) data\naugmentation is essential in maintaining plasticity; (2) the critic's\nplasticity loss serves as the principal bottleneck impeding efficient training;\nand (3) without timely intervention to recover critic's plasticity in the early\nstages, its loss becomes catastrophic. These insights suggest a novel strategy\nto address the high replay ratio (RR) dilemma, where exacerbated plasticity\nloss hinders the potential improvements of sample efficiency brought by\nincreased reuse frequency. Rather than setting a static RR for the entire\ntraining process, we propose Adaptive RR, which dynamically adjusts the RR\nbased on the critic's plasticity level. Extensive evaluations indicate that\nAdaptive RR not only avoids catastrophic plasticity loss in the early stages\nbut also benefits from more frequent reuse in later phases, resulting in\nsuperior sample efficiency.\n","authors":["Guozheng Ma","Lu Li","Sen Zhang","Zixuan Liu","Zhen Wang","Yixin Chen","Li Shen","Xueqian Wang","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2310.07418v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07417v1","updated":"2023-10-11T12:03:19Z","published":"2023-10-11T12:03:19Z","title":"What can knowledge graph alignment gain with Neuro-Symbolic learning\n approaches?","summary":" Knowledge Graphs (KG) are the backbone of many data-intensive applications\nsince they can represent data coupled with its meaning and context. Aligning\nKGs across different domains and providers is necessary to afford a fuller and\nintegrated representation. A severe limitation of current KG alignment (KGA)\nalgorithms is that they fail to articulate logical thinking and reasoning with\nlexical, structural, and semantic data learning. Deep learning models are\nincreasingly popular for KGA inspired by their good performance in other tasks,\nbut they suffer from limitations in explainability, reasoning, and data\nefficiency. Hybrid neurosymbolic learning models hold the promise of\nintegrating logical and data perspectives to produce high-quality alignments\nthat are explainable and support validation through human-centric approaches.\nThis paper examines the current state of the art in KGA and explores the\npotential for neurosymbolic integration, highlighting promising research\ndirections for combining these fields.\n","authors":["Pedro Giesteira Cotovio","Ernesto Jimenez-Ruiz","Catia Pesquita"],"pdf_url":"https://arxiv.org/pdf/2310.07417v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07416v1","updated":"2023-10-11T12:01:52Z","published":"2023-10-11T12:01:52Z","title":"A Novel Voronoi-based Convolutional Neural Network Framework for Pushing\n Person Detection in Crowd Videos","summary":" Analyzing the microscopic dynamics of pushing behavior within crowds can\noffer valuable insights into crowd patterns and interactions. By identifying\ninstances of pushing in crowd videos, a deeper understanding of when, where,\nand why such behavior occurs can be achieved. This knowledge is crucial to\ncreating more effective crowd management strategies, optimizing crowd flow, and\nenhancing overall crowd experiences. However, manually identifying pushing\nbehavior at the microscopic level is challenging, and the existing automatic\napproaches cannot detect such microscopic behavior. 
Thus, this article\nintroduces a novel automatic framework for identifying pushing in videos of\ncrowds on a microscopic level. The framework comprises two main components: i)\nFeature extraction and ii) Video labeling. In the feature extraction component,\na new Voronoi-based method is developed for determining the local regions\nassociated with each person in the input video. Subsequently, these regions are\nfed into EfficientNetV1B0 Convolutional Neural Network to extract the deep\nfeatures of each person over time. In the second component, a combination of a\nfully connected layer with a Sigmoid activation function is employed to analyze\nthese deep features and annotate the individuals involved in pushing within the\nvideo. The framework is trained and evaluated on a new dataset created using\nsix real-world experiments, including their corresponding ground truths. The\nexperimental findings indicate that the suggested framework outperforms seven\nbaseline methods that are employed for comparative analysis purposes.\n","authors":["Ahmed Alia","Mohammed Maree","Mohcine Chraibi","Armin Seyfried"],"pdf_url":"https://arxiv.org/pdf/2310.07416v1.pdf","comment":"21 pages"},{"id":"http://arxiv.org/abs/2310.07402v1","updated":"2023-10-11T11:38:18Z","published":"2023-10-11T11:38:18Z","title":"NuTime: Numerically Multi-Scaled Embedding for Large-Scale Time Series\n Pretraining","summary":" Recent research on time-series self-supervised models shows great promise in\nlearning semantic representations. However, it has been limited to small-scale\ndatasets, e.g., thousands of temporal sequences. In this work, we make key\ntechnical contributions that are tailored to the numerical properties of\ntime-series data and allow the model to scale to large datasets, e.g., millions\nof temporal sequences. We adopt the Transformer architecture by first\npartitioning the input into non-overlapping windows. Each window is then\ncharacterized by its normalized shape and two scalar values denoting the mean\nand standard deviation within each window. To embed scalar values that may\npossess arbitrary numerical scales to high-dimensional vectors, we propose a\nnumerically multi-scaled embedding module enumerating all possible scales for\nthe scalar values. The model undergoes pretraining using the proposed\nnumerically multi-scaled embedding with a simple contrastive objective on a\nlarge-scale dataset containing over a million sequences. We study its transfer\nperformance on a number of univariate and multivariate classification\nbenchmarks. Our method exhibits remarkable improvement against previous\nrepresentation learning approaches and establishes the new state of the art,\neven compared with domain-specific non-learning-based methods.\n","authors":["Chenguo Lin","Xumeng Wen","Wei Cao","Congrui Huang","Jiang Bian","Stephen Lin","Zhirong Wu"],"pdf_url":"https://arxiv.org/pdf/2310.07402v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.05682v2","updated":"2023-10-11T11:28:40Z","published":"2023-10-09T12:51:46Z","title":"Analysis of Rainfall Variability and Water Extent of Selected Hydropower\n Reservoir Using Google Earth Engine (GEE): A Case Study from Two Tropical\n Countries, Sri Lanka and Vietnam","summary":" This study presents a comprehensive remote sensing analysis of rainfall\npatterns and selected hydropower reservoir water extent in two tropical monsoon\ncountries, Vietnam and Sri Lanka. 
The aim is to understand the relationship\nbetween remotely sensed rainfall data and the dynamic changes (monthly) in\nreservoir water extent. The analysis utilizes high-resolution optical imagery\nand Sentinel-1 Synthetic Aperture Radar (SAR) data to observe and monitor water\nbodies during different weather conditions, especially during the monsoon\nseason. The average annual rainfall for both countries is determined, and\nspatiotemporal variations in monthly average rainfall are examined at regional\nand reservoir basin levels using the Climate Hazards Group InfraRed\nPrecipitation with Station (CHIRPS) dataset from 1981 to 2022. Water extents\nare derived for selected reservoirs using Sentinel-1 SAR Ground Range Detected\n(GRD) images in Vietnam and Sri Lanka from 2017 to 2022. The images are\npre-processed and corrected using terrain correction and refined Lee filter. An\nautomated thresholding algorithm, OTSU, distinguishes water and land, taking\nadvantage of both VV and VH polarization data. The connected pixel count\nthreshold is applied to enhance result accuracy. The results indicate a clear\nrelationship between rainfall patterns and reservoir water extent, with\nincreased precipitation during the monsoon season leading to higher water\nextents in the later months. This study contributes to understanding how\nrainfall variability impacts reservoir water resources in tropical monsoon\nregions. The preliminary findings can inform water resource management\nstrategies and support these countries' decision-making processes related to\nhydropower generation, flood management, and irrigation.\n","authors":["Punsisi Rajakaruna","Surajit Ghosh","Bunyod Holmatov"],"pdf_url":"https://arxiv.org/pdf/2310.05682v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07392v1","updated":"2023-10-11T11:20:35Z","published":"2023-10-11T11:20:35Z","title":"Deep Kernel and Image Quality Estimators for Optimizing Robotic\n Ultrasound Controller using Bayesian Optimization","summary":" Ultrasound is a commonly used medical imaging modality that requires expert\nsonographers to manually maneuver the ultrasound probe based on the acquired\nimage. Autonomous Robotic Ultrasound (A-RUS) is an appealing alternative to\nthis manual procedure in order to reduce sonographers' workload. The key\nchallenge to A-RUS is optimizing the ultrasound image quality for the region of\ninterest across different patients. This requires knowledge of anatomy,\nrecognition of error sources and precise probe position, orientation and\npressure. Sample efficiency is important while optimizing these parameters\nassociated with the robotized probe controller. Bayesian Optimization (BO), a\nsample-efficient optimization framework, has recently been applied to optimize\nthe 2D motion of the probe. Nevertheless, further improvements are needed to\nimprove the sample efficiency for high-dimensional control of the probe. We aim\nto overcome this problem by using a neural network to learn a low-dimensional\nkernel in BO, termed as Deep Kernel (DK). The neural network of DK is trained\nusing probe and image data acquired during the procedure. The two image quality\nestimators are proposed that use a deep convolution neural network and provide\nreal-time feedback to the BO. We validated our framework using these two\nfeedback functions on three urinary bladder phantoms. 
We obtained over 50%\nincrease in sample efficiency for 6D control of the robotized probe.\nFurthermore, our results indicate that this performance enhancement in BO is\nindependent of the specific training dataset, demonstrating inter-patient\nadaptability.\n","authors":["Deepak Raina","SH Chandrashekhara","Richard Voyles","Juan Wachs","Subir Kumar Saha"],"pdf_url":"https://arxiv.org/pdf/2310.07392v1.pdf","comment":"Accepted in IEEE International Symposium on Medical Robotics (ISMR)\n 2023"},{"id":"http://arxiv.org/abs/2310.07380v1","updated":"2023-10-11T10:55:14Z","published":"2023-10-11T10:55:14Z","title":"Histopathological Image Classification and Vulnerability Analysis using\n Federated Learning","summary":" Healthcare is one of the foremost applications of machine learning (ML).\nTraditionally, ML models are trained by central servers, which aggregate data\nfrom various distributed devices to forecast the results for newly generated\ndata. This is a major concern as models can access sensitive user information,\nwhich raises privacy concerns. A federated learning (FL) approach can help\naddress this issue: A global model sends its copy to all clients who train\nthese copies, and the clients send the updates (weights) back to it. Over time,\nthe global model improves and becomes more accurate. Data privacy is protected\nduring training, as it is conducted locally on the clients' devices.\n However, the global model is susceptible to data poisoning. We develop a\nprivacy-preserving FL technique for a skin cancer dataset and show that the\nmodel is prone to data poisoning attacks. Ten clients train the model, but one\nof them intentionally introduces flipped labels as an attack. This reduces the\naccuracy of the global model. As the percentage of label flipping increases,\nthere is a noticeable decrease in accuracy. We use a stochastic gradient\ndescent optimization algorithm to find the most optimal accuracy for the model.\nAlthough FL can protect user privacy for healthcare diagnostics, it is also\nvulnerable to data poisoning, which must be addressed.\n","authors":["Sankalp Vyas","Amar Nath Patra","Raj Mani Shukla"],"pdf_url":"https://arxiv.org/pdf/2310.07380v1.pdf","comment":"Accepted in IEEE International Conference on Trust, Security and\n Privacy in Computing and Communications (TrustCom)"},{"id":"http://arxiv.org/abs/2310.07379v1","updated":"2023-10-11T10:54:44Z","published":"2023-10-11T10:54:44Z","title":"Causal Unsupervised Semantic Segmentation","summary":" Unsupervised semantic segmentation aims to achieve high-quality semantic\ngrouping without human-labeled annotations. With the advent of self-supervised\npre-training, various frameworks utilize the pre-trained features to train\nprediction heads for unsupervised dense prediction. However, a significant\nchallenge in this unsupervised setup is determining the appropriate level of\nclustering required for segmenting concepts. To address it, we propose a novel\nframework, CAusal Unsupervised Semantic sEgmentation (CAUSE), which leverages\ninsights from causal inference. Specifically, we bridge intervention-oriented\napproach (i.e., frontdoor adjustment) to define suitable two-step tasks for\nunsupervised prediction. The first step involves constructing a concept\nclusterbook as a mediator, which represents possible concept prototypes at\ndifferent levels of granularity in a discretized form. Then, the mediator\nestablishes an explicit link to the subsequent concept-wise self-supervised\nlearning for pixel-level grouping. 
Through extensive experiments and analyses\non various datasets, we corroborate the effectiveness of CAUSE and achieve\nstate-of-the-art performance in unsupervised semantic segmentation.\n","authors":["Junho Kim","Byung-Kwan Lee","Yong Man Ro"],"pdf_url":"https://arxiv.org/pdf/2310.07379v1.pdf","comment":"code available:\n https://github.com/ByungKwanLee/Causal-Unsupervised-Segmentation"},{"id":"http://arxiv.org/abs/2305.08732v3","updated":"2023-10-11T10:51:12Z","published":"2023-05-15T15:47:09Z","title":"Knowledge Rumination for Pre-trained Language Models","summary":" Previous studies have revealed that vanilla pre-trained language models\n(PLMs) lack the capacity to handle knowledge-intensive NLP tasks alone; thus,\nseveral works have attempted to integrate external knowledge into PLMs.\nHowever, despite the promising outcome, we empirically observe that PLMs may\nhave already encoded rich knowledge in their pre-trained parameters but fail to\nfully utilize it when applied to knowledge-intensive tasks. In this\npaper, we propose a new paradigm dubbed Knowledge Rumination to help the\npre-trained language model utilize the related latent knowledge without\nretrieving it from the external corpus. By simply adding a prompt like \"As far\nas I know\" to the PLMs, we try to review related latent knowledge and inject\nit back into the model for knowledge consolidation. We apply the proposed\nknowledge rumination to various language models, including RoBERTa, DeBERTa,\nand GPT-3. Experimental results on six commonsense reasoning tasks and GLUE\nbenchmarks demonstrate the effectiveness of our proposed approach, which proves\nthat the knowledge stored in PLMs can be better exploited to enhance\nperformance. Code is available at\nhttps://github.com/zjunlp/knowledge-rumination.\n","authors":["Yunzhi Yao","Peng Wang","Shengyu Mao","Chuanqi Tan","Fei Huang","Huajun Chen","Ningyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2305.08732v3.pdf","comment":"EMNLP 2023"},{"id":"http://arxiv.org/abs/2310.07371v1","updated":"2023-10-11T10:41:51Z","published":"2023-10-11T10:41:51Z","title":"Experimental quantum natural gradient optimization in photonics","summary":" Variational quantum algorithms (VQAs), combining the advantages of\nparameterized quantum circuits and classical optimizers, promise practical\nquantum applications in the Noisy Intermediate-Scale Quantum era. The\nperformance of VQAs heavily depends on the optimization method. Compared with\ngradient-free and ordinary gradient descent methods, the quantum natural\ngradient (QNG), which mirrors the geometric structure of the parameter space,\ncan achieve faster convergence and avoid local minima more easily, thereby\nreducing the cost of circuit executions. We utilized a fully programmable\nphotonic chip to experimentally estimate the QNG in photonics for the first\ntime. We obtained the dissociation curve of the He-H$^+$ cation and achieved\nchemical accuracy, verifying the superior performance of QNG optimization on a\nphotonic device. 
Our work opens up a vista of utilizing QNG in photonics to\nimplement practical near-term quantum applications.\n","authors":["Yizhi Wang","Shichuan Xue","Yaxuan Wang","Jiangfang Ding","Weixu Shi","Dongyang Wang","Yong Liu","Yingwen Liu","Xiang Fu","Guangyao Huang","Anqi Huang","Mingtang Deng","Junjie Wu"],"pdf_url":"https://arxiv.org/pdf/2310.07371v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07370v1","updated":"2023-10-11T10:40:43Z","published":"2023-10-11T10:40:43Z","title":"Orthogonal Random Features: Explicit Forms and Sharp Inequalities","summary":" Random features have been introduced to scale up kernel methods via\nrandomization techniques. In particular, random Fourier features and orthogonal\nrandom features were used to approximate the popular Gaussian kernel. The\nformer is performed by a random Gaussian matrix and leads exactly to the\nGaussian kernel after averaging. In this work, we analyze the bias and the\nvariance of the kernel approximation based on orthogonal random features which\nmakes use of Haar orthogonal matrices. We provide explicit expressions for\nthese quantities using normalized Bessel functions and derive sharp exponential\nbounds supporting the view that orthogonal random features are more informative\nthan random Fourier features.\n","authors":["Nizar Demni","Hachem Kadri"],"pdf_url":"https://arxiv.org/pdf/2310.07370v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07367v1","updated":"2023-10-11T10:34:52Z","published":"2023-10-11T10:34:52Z","title":"Improved Analysis of Sparse Linear Regression in Local Differential\n Privacy Model","summary":" In this paper, we revisit the problem of sparse linear regression in the\nlocal differential privacy (LDP) model. Existing research in the\nnon-interactive and sequentially local models has focused on obtaining the\nlower bounds for the case where the underlying parameter is $1$-sparse, and\nextending such bounds to the more general $k$-sparse case has proven to be\nchallenging. Moreover, it is unclear whether efficient non-interactive LDP\n(NLDP) algorithms exist. To address these issues, we first consider the problem\nin the $\\epsilon$ non-interactive LDP model and provide a lower bound of\n$\\Omega(\\frac{\\sqrt{dk\\log d}}{\\sqrt{n}\\epsilon})$ on the $\\ell_2$-norm\nestimation error for sub-Gaussian data, where $n$ is the sample size and $d$ is\nthe dimension of the space. We propose an innovative NLDP algorithm, the very\nfirst of its kind for the problem. As a remarkable outcome, this algorithm also\nyields a novel and highly efficient estimator as a valuable by-product. Our\nalgorithm achieves an upper bound of\n$\\tilde{O}({\\frac{d\\sqrt{k}}{\\sqrt{n}\\epsilon}})$ for the estimation error when\nthe data is sub-Gaussian, which can be further improved by a factor of\n$O(\\sqrt{d})$ if the server has additional public but unlabeled data. For the\nsequentially interactive LDP model, we show a similar lower bound of\n$\\Omega({\\frac{\\sqrt{dk}}{\\sqrt{n}\\epsilon}})$. As for the upper bound, we\nrectify a previous method and show that it is possible to achieve a bound of\n$\\tilde{O}(\\frac{k\\sqrt{d}}{\\sqrt{n}\\epsilon})$. 
Our findings reveal\nfundamental differences between the non-private case, central DP model, and\nlocal DP model in the sparse linear regression problem.\n","authors":["Liyang Zhu","Meng Ding","Vaneet Aggarwal","Jinhui Xu","Di Wang"],"pdf_url":"https://arxiv.org/pdf/2310.07367v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07365v1","updated":"2023-10-11T10:30:49Z","published":"2023-10-11T10:30:49Z","title":"GraphControl: Adding Conditional Control to Universal Graph Pre-trained\n Models for Graph Domain Transfer Learning","summary":" Graph-structured data is ubiquitous in the world which models complex\nrelationships between objects, enabling various Web applications. Daily\ninfluxes of unlabeled graph data on the Web offer immense potential for these\napplications. Graph self-supervised algorithms have achieved significant\nsuccess in acquiring generic knowledge from abundant unlabeled graph data.\nThese pre-trained models can be applied to various downstream Web applications,\nsaving training time and improving downstream (target) performance. However,\ndifferent graphs, even across seemingly similar domains, can differ\nsignificantly in terms of attribute semantics, posing difficulties, if not\ninfeasibility, for transferring the pre-trained models to downstream tasks.\nConcretely speaking, for example, the additional task-specific node information\nin downstream tasks (specificity) is usually deliberately omitted so that the\npre-trained representation (transferability) can be leveraged. The trade-off as\nsuch is termed as \"transferability-specificity dilemma\" in this work. To\naddress this challenge, we introduce an innovative deployment module coined as\nGraphControl, motivated by ControlNet, to realize better graph domain transfer\nlearning. Specifically, by leveraging universal structural pre-trained models\nand GraphControl, we align the input space across various graphs and\nincorporate unique characteristics of target data as conditional inputs. These\nconditions will be progressively integrated into the model during fine-tuning\nor prompt tuning through ControlNet, facilitating personalized deployment.\nExtensive experiments show that our method significantly enhances the\nadaptability of pre-trained models on target attributed datasets, achieving\n1.4-3x performance gain. Furthermore, it outperforms training-from-scratch\nmethods on target data with a comparable margin and exhibits faster\nconvergence.\n","authors":["Yun Zhu","Yaoke Wang","Haizhou Shi","Zhenshuo Zhang","Siliang Tang"],"pdf_url":"https://arxiv.org/pdf/2310.07365v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2307.02484v4","updated":"2023-10-11T10:27:40Z","published":"2023-07-05T17:58:21Z","title":"Elastic Decision Transformer","summary":" This paper introduces Elastic Decision Transformer (EDT), a significant\nadvancement over the existing Decision Transformer (DT) and its variants.\nAlthough DT purports to generate an optimal trajectory, empirical evidence\nsuggests it struggles with trajectory stitching, a process involving the\ngeneration of an optimal or near-optimal trajectory from the best parts of a\nset of sub-optimal trajectories. The proposed EDT differentiates itself by\nfacilitating trajectory stitching during action inference at test time,\nachieved by adjusting the history length maintained in DT. 
Further, the EDT\noptimizes the trajectory by retaining a longer history when the previous\ntrajectory is optimal and a shorter one when it is sub-optimal, enabling it to\n\"stitch\" with a more optimal trajectory. Extensive experimentation demonstrates\nEDT's ability to bridge the performance gap between DT-based and Q\nLearning-based approaches. In particular, the EDT outperforms Q Learning-based\nmethods in a multi-task regime on the D4RL locomotion benchmark and Atari\ngames. Videos are available at: https://kristery.github.io/edt/\n","authors":["Yueh-Hua Wu","Xiaolong Wang","Masashi Hamaya"],"pdf_url":"https://arxiv.org/pdf/2307.02484v4.pdf","comment":"Accepted to NeurIPS 2023"},{"id":"http://arxiv.org/abs/2302.11509v2","updated":"2023-10-11T10:20:58Z","published":"2023-02-22T17:26:03Z","title":"Construction of Knowledge Graphs: State and Challenges","summary":" With knowledge graphs (KGs) at the center of numerous applications such as\nrecommender systems and question answering, the need for generalized pipelines\nto construct and continuously update such KGs is increasing. While the\nindividual steps that are necessary to create KGs from unstructured (e.g. text)\nand structured data sources (e.g. databases) are mostly well-researched for\ntheir one-shot execution, their adoption for incremental KG updates and the\ninterplay of the individual steps have hardly been investigated in a systematic\nmanner so far. In this work, we first discuss the main graph models for KGs and\nintroduce the major requirement for future KG construction pipelines. Next, we\nprovide an overview of the necessary steps to build high-quality KGs, including\ncross-cutting topics such as metadata management, ontology development, and\nquality assurance. We then evaluate the state of the art of KG construction\nw.r.t the introduced requirements for specific popular KGs as well as some\nrecent tools and strategies for KG construction. Finally, we identify areas in\nneed of further research and improvement.\n","authors":["Marvin Hofer","Daniel Obraczka","Alieh Saeedi","Hanna Köpcke","Erhard Rahm"],"pdf_url":"https://arxiv.org/pdf/2302.11509v2.pdf","comment":"51 pages, 5 figures, 4 tables, 328 references"},{"id":"http://arxiv.org/abs/2310.07359v1","updated":"2023-10-11T10:17:41Z","published":"2023-10-11T10:17:41Z","title":"Diagnosing Bipolar Disorder from 3-D Structural Magnetic Resonance\n Images Using a Hybrid GAN-CNN Method","summary":" Bipolar Disorder (BD) is a psychiatric condition diagnosed by repetitive\ncycles of hypomania and depression. Since diagnosing BD relies on subjective\nbehavioral assessments over a long period, a solid diagnosis based on objective\ncriteria is not straightforward. The current study responded to the described\nobstacle by proposing a hybrid GAN-CNN model to diagnose BD from 3-D structural\nMRI Images (sMRI). The novelty of this study stems from diagnosing BD from sMRI\nsamples rather than conventional datasets such as functional MRI (fMRI),\nelectroencephalography (EEG), and behavioral symptoms while removing the data\ninsufficiency usually encountered when dealing with sMRI samples. The impact of\nvarious augmentation ratios is also tested using 5-fold cross-validation. Based\non the results, this study obtains an accuracy rate of 75.8%, a sensitivity of\n60.3%, and a specificity of 82.5%, which are 3-5% higher than prior work while\nutilizing less than 6% sample counts. 
Next, it is demonstrated that a 2-D\nlayer-based GAN generator can effectively reproduce complex 3D brain samples, a\nmore straightforward technique than manual image processing. Lastly, the\noptimum augmentation threshold for the current study using 172 sMRI samples is\n50%, showing the applicability of the described method for larger sMRI\ndatasets. In conclusion, it is established that data augmentation using GAN\nimproves the accuracy of the CNN classifier using sMRI samples, thus enabling\nmore reliable decision support systems to assist practitioners in identifying\nBD patients more reliably and in a shorter period.\n","authors":["Masood Hamed Saghayan","Mohammad Hossein Zolfagharnasab","Ali Khadem","Farzam Matinfar","Hassan Rashidi"],"pdf_url":"https://arxiv.org/pdf/2310.07359v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07355v1","updated":"2023-10-11T10:12:43Z","published":"2023-10-11T10:12:43Z","title":"IMITATE: Clinical Prior Guided Hierarchical Vision-Language Pre-training","summary":" In the field of medical Vision-Language Pre-training (VLP), significant\nefforts have been devoted to deriving text and image features from both\nclinical reports and associated medical images. However, most existing methods\nmay have overlooked the opportunity of leveraging the inherent hierarchical\nstructure of clinical reports, which are generally split into `findings' for\ndescriptive content and `impressions' for conclusive observation. Instead of\nutilizing this rich, structured format, current medical VLP approaches often\nsimplify the report into either a unified entity or fragmented tokens. In this\nwork, we propose a novel clinical prior guided VLP framework named IMITATE to\nlearn the structure information from medical reports with hierarchical\nvision-language alignment. The framework derives multi-level visual features\nfrom the chest X-ray (CXR) images and separately aligns these features with the\ndescriptive and the conclusive text encoded in the hierarchical medical report.\nFurthermore, a new clinical-informed contrastive loss is introduced for\ncross-modal learning, which accounts for clinical prior knowledge in\nformulating sample correlations in contrastive learning. The proposed model,\nIMITATE, outperforms baseline VLP methods across six different datasets,\nspanning five medical imaging downstream tasks. Comprehensive experimental\nresults highlight the advantages of integrating the hierarchical structure of\nmedical reports for vision-language alignment.\n","authors":["Che Liu","Sibo Cheng","Miaojing Shi","Anand Shah","Wenjia Bai","Rossella Arcucci"],"pdf_url":"https://arxiv.org/pdf/2310.07355v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2310.07351v1","updated":"2023-10-11T10:03:10Z","published":"2023-10-11T10:03:10Z","title":"Atom-Motif Contrastive Transformer for Molecular Property Prediction","summary":" Recently, Graph Transformer (GT) models have been widely used in the task of\nMolecular Property Prediction (MPP) due to their high reliability in\ncharacterizing the latent relationship among graph nodes (i.e., the atoms in a\nmolecule). However, most existing GT-based methods usually explore the basic\ninteractions between pairwise atoms, and thus they fail to consider the\nimportant interactions among critical motifs (e.g., functional groups consisting\nof several atoms) of molecules. 
As motifs in a molecule are significant\npatterns that are of great importance for determining molecular properties\n(e.g., toxicity and solubility), overlooking motif interactions inevitably\nhinders the effectiveness of MPP. To address this issue, we propose a novel\nAtom-Motif Contrastive Transformer (AMCT), which not only explores the\natom-level interactions but also considers the motif-level interactions. Since\nthe representations of atoms and motifs for a given molecule are actually two\ndifferent views of the same instance, they are naturally aligned to generate\nthe self-supervisory signals for model training. Meanwhile, the same motif can\nexist in different molecules, and hence we also employ the contrastive loss to\nmaximize the representation agreement of identical motifs across different\nmolecules. Finally, in order to clearly identify the motifs that are critical\nin deciding the properties of each molecule, we further construct a\nproperty-aware attention mechanism into our learning framework. Our proposed\nAMCT is extensively evaluated on seven popular benchmark datasets, and both\nquantitative and qualitative results firmly demonstrate its effectiveness when\ncompared with the state-of-the-art methods.\n","authors":["Wentao Yu","Shuo Chen","Chen Gong","Gang Niu","Masashi Sugiyama"],"pdf_url":"https://arxiv.org/pdf/2310.07351v1.pdf","comment":"submit to AAAI-24"},{"id":"http://arxiv.org/abs/2310.07347v1","updated":"2023-10-11T09:55:46Z","published":"2023-10-11T09:55:46Z","title":"Fast-ELECTRA for Efficient Pre-training","summary":" ELECTRA pre-trains language models by detecting tokens in a sequence that\nhave been replaced by an auxiliary model. Although ELECTRA offers a significant\nboost in efficiency, its potential is constrained by the training cost brought\nby the auxiliary model. Notably, this model, which is jointly trained with the\nmain model, only serves to assist the training of the main model and is\ndiscarded post-training. This results in a substantial amount of training cost\nbeing expended in vain. To mitigate this issue, we propose Fast-ELECTRA, which\nleverages an existing language model as the auxiliary model. To construct a\nlearning curriculum for the main model, we smooth its output distribution via\ntemperature scaling following a descending schedule. Our approach rivals the\nperformance of state-of-the-art ELECTRA-style pre-training methods, while\nsignificantly eliminating the computation and memory cost brought by the joint\ntraining of the auxiliary model. Our method also reduces the sensitivity to\nhyper-parameters and enhances the pre-training stability.\n","authors":["Chengyu Dong","Liyuan Liu","Hao Cheng","Jingbo Shang","Jianfeng Gao","Xiaodong Liu"],"pdf_url":"https://arxiv.org/pdf/2310.07347v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.06841v2","updated":"2023-10-11T09:53:51Z","published":"2023-04-13T22:20:54Z","title":"Video alignment using unsupervised learning of local and global features","summary":" In this paper, we tackle the problem of video alignment, the process of\nmatching the frames of a pair of videos containing similar actions. The main\nchallenge in video alignment is that accurate correspondence should be\nestablished despite the differences in the execution processes and appearances\nbetween the two videos. We introduce an unsupervised method for alignment that\nuses global and local features of the frames. 
In particular, we introduce\neffective features for each video frame using three machine vision tools:\nperson detection, pose estimation, and VGG network. Then, the features are\nprocessed and combined to construct a multidimensional time series that\nrepresents the video. The resulting time series are used to align videos of the\nsame actions using a novel version of dynamic time warping named Diagonalized\nDynamic Time Warping(DDTW). The main advantage of our approach is that no\ntraining is required, which makes it applicable for any new type of action\nwithout any need to collect training samples for it. For evaluation, we\nconsidered video synchronization and phase classification tasks on the Penn\naction dataset. Also, for an effective evaluation of the video synchronization\ntask, we present a new metric called Enclosed Area Error(EAE). The results show\nthat our method outperforms previous state-of-the-art methods, such as TCC, and\nother self-supervised and weakly supervised methods.\n","authors":["Niloufar Fakhfour","Mohammad ShahverdiKondori","Hoda Mohammadzade"],"pdf_url":"https://arxiv.org/pdf/2304.06841v2.pdf","comment":"19 pages, 6 figures"},{"id":"http://arxiv.org/abs/2310.07338v1","updated":"2023-10-11T09:37:38Z","published":"2023-10-11T09:37:38Z","title":"Towards Foundation Models for Learning on Tabular Data","summary":" Learning on tabular data underpins numerous real-world applications. Despite\nconsiderable efforts in developing effective learning models for tabular data,\ncurrent transferable tabular models remain in their infancy, limited by either\nthe lack of support for direct instruction following in new tasks or the\nneglect of acquiring foundational knowledge and capabilities from diverse\ntabular datasets. In this paper, we propose Tabular Foundation Models (TabFMs)\nto overcome these limitations. TabFMs harness the potential of generative\ntabular learning, employing a pre-trained large language model (LLM) as the\nbase model and fine-tuning it using purpose-designed objectives on an extensive\nrange of tabular datasets. This approach endows TabFMs with a profound\nunderstanding and universal capabilities essential for learning on tabular\ndata. Our evaluations underscore TabFM's effectiveness: not only does it\nsignificantly excel in instruction-following tasks like zero-shot and\nin-context inference, but it also showcases performance that approaches, and in\ninstances, even transcends, the renowned yet mysterious closed-source LLMs like\nGPT-4. Furthermore, when fine-tuning with scarce data, our model achieves\nremarkable efficiency and maintains competitive performance with abundant\ntraining data. Finally, while our results are promising, we also delve into\nTabFM's limitations and potential opportunities, aiming to stimulate and\nexpedite future research on developing more potent TabFMs.\n","authors":["Han Zhang","Xumeng Wen","Shun Zheng","Wei Xu","Jiang Bian"],"pdf_url":"https://arxiv.org/pdf/2310.07338v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.09809v3","updated":"2023-10-11T09:34:21Z","published":"2022-10-18T12:28:37Z","title":"Analysis of Convolutions, Non-linearity and Depth in Graph Neural\n Networks using Neural Tangent Kernel","summary":" The fundamental principle of Graph Neural Networks (GNNs) is to exploit the\nstructural information of the data by aggregating the neighboring nodes using a\n`graph convolution' in conjunction with a suitable choice for the network\narchitecture, such as depth and activation functions. 
Therefore, understanding\nthe influence of each design choice on the network performance is\ncrucial. Convolutions based on the graph Laplacian have emerged as the dominant\nchoice, with the symmetric normalization of the adjacency matrix as the most\nwidely adopted one. However, some empirical studies show that row normalization\nof the adjacency matrix outperforms it in node classification. Despite the\nwidespread use of GNNs, there is no rigorous theoretical study on the\nrepresentation power of these convolutions that could explain this behavior.\nSimilarly, the empirical observation that linear GNNs perform on\npar with non-linear ReLU GNNs lacks rigorous theory.\n In this work, we theoretically analyze the influence of different aspects of\nthe GNN architecture using the Graph Neural Tangent Kernel in a semi-supervised\nnode classification setting. Under the population Degree Corrected Stochastic\nBlock Model, we prove that: (i) linear networks capture the class information\nas well as ReLU networks; (ii) row normalization preserves the underlying class\nstructure better than other convolutions; (iii) performance degrades with\nnetwork depth due to over-smoothing, but the loss in class information is the\nslowest in row normalization; (iv) skip connections retain the class\ninformation even at infinite depth, thereby eliminating over-smoothing. We\nfinally validate our theoretical findings numerically and on real datasets such\nas Cora and Citeseer.\n","authors":["Mahalakshmi Sabanayagam","Pascal Esser","Debarghya Ghoshdastidar"],"pdf_url":"https://arxiv.org/pdf/2210.09809v3.pdf","comment":"41 pages, 24 figures. Code available at\n https://github.com/mahalakshmi-sabanayagam/NTK_GCN"},{"id":"http://arxiv.org/abs/2310.07335v1","updated":"2023-10-11T09:25:24Z","published":"2023-10-11T09:25:24Z","title":"Exploring Social Motion Latent Space and Human Awareness for Effective\n Robot Navigation in Crowded Environments","summary":" This work proposes a novel approach to social robot navigation by learning to\ngenerate robot controls from a social motion latent space. By leveraging this\nsocial motion latent space, the proposed method achieves significant\nimprovements in social navigation metrics such as success rate, navigation\ntime, and trajectory length while producing smoother (less jerk and angular\ndeviations) and more anticipatory trajectories. The superiority of the proposed\nmethod is demonstrated through comparison with baseline models in various\nscenarios. Additionally, the concept of humans' awareness towards the robot is\nintroduced into the social robot navigation framework, showing that\nincorporating human awareness leads to shorter and smoother trajectories owing\nto humans' ability to positively interact with the robot.\n","authors":["Junaid Ahmed Ansari","Satyajit Tourani","Gourav Kumar","Brojeshwar Bhowmick"],"pdf_url":"https://arxiv.org/pdf/2310.07335v1.pdf","comment":"Accepted at IROS 2023"},{"id":"http://arxiv.org/abs/2310.07325v1","updated":"2023-10-11T09:14:40Z","published":"2023-10-11T09:14:40Z","title":"An Adversarial Example for Direct Logit Attribution: Memory Management\n in gelu-4l","summary":" We provide concrete evidence for memory management in a 4-layer transformer.\nSpecifically, we identify clean-up behavior, in which model components\nconsistently remove the output of preceding components during a forward pass.\nOur findings suggest that the interpretability technique Direct Logit\nAttribution provides misleading results. 
We show explicit examples where this\ntechnique is inaccurate, as it does not account for clean-up behavior.\n","authors":["James Dao","Yeu-Tong Lao","Can Rager","Jett Janiak"],"pdf_url":"https://arxiv.org/pdf/2310.07325v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07323v1","updated":"2023-10-11T09:14:17Z","published":"2023-10-11T09:14:17Z","title":"Multichannel consecutive data cross-extraction with 1DCNN-attention for\n diagnosis of power transformer","summary":" The power transformer plays a critical role in grid infrastructure, and its\ndiagnosis is paramount for maintaining stable operation. However, the current\nmethods for transformer diagnosis focus on discrete dissolved gas analysis,\nneglecting deep feature extraction of multichannel consecutive data. The\nunutilized sequential data contains significant temporal information\nreflecting the transformer's condition. In light of this, the structure of\nmultichannel consecutive data cross-extraction (MCDC) is proposed in this\narticle in order to comprehensively exploit the intrinsic characteristics and\nevaluate the state of the transformer. Moreover, to better accommodate the\nscenario of transformer diagnosis, a one-dimensional convolutional neural network\nattention (1DCNN-attention) mechanism is introduced and offers a more efficient\nsolution given the simplified spatial complexity. Finally, the effectiveness of\nMCDC and the superior generalization ability, compared with other algorithms,\nare validated in experiments conducted on a dataset collected from real\noperation cases of power transformers. Additionally, the better stability of\n1DCNN-attention has also been verified.\n","authors":["Wei Zheng","Guogang Zhang","Chenchen Zhao","Qianqian Zhu"],"pdf_url":"https://arxiv.org/pdf/2310.07323v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07321v1","updated":"2023-10-11T09:09:55Z","published":"2023-10-11T09:09:55Z","title":"On the Impact of Cross-Domain Data on German Language Models","summary":" Traditionally, large language models have been trained on either general web\ncrawls or domain-specific data. However, recent successes of generative large\nlanguage models have shed light on the benefits of cross-domain datasets. To\nexamine the significance of prioritizing data diversity over quality, we\npresent a German dataset comprising texts from five domains, along with another\ndataset aimed at containing high-quality data. Through training a series of\nmodels ranging between 122M and 750M parameters on both datasets, we conduct a\ncomprehensive benchmark on multiple downstream tasks. Our findings demonstrate\nthat the models trained on the cross-domain dataset outperform those trained on\nquality data alone, leading to improvements of up to $4.45\\%$ over the previous\nstate-of-the-art. The models are available at\nhttps://huggingface.co/ikim-uk-essen\n","authors":["Amin Dada","Aokun Chen","Cheng Peng","Kaleb E Smith","Ahmad Idrissi-Yaghir","Constantin Marc Seibold","Jianning Li","Lars Heiliger","Christoph M. 
Friedrich","Daniel Truhn","Jan Egger","Jiang Bian","Jens Kleesiek","Yonghui Wu"],"pdf_url":"https://arxiv.org/pdf/2310.07321v1.pdf","comment":"13 pages, 1 figure, accepted at Findings of the Association for\n Computational Linguistics: EMNLP 2023"},{"id":"http://arxiv.org/abs/2310.07320v1","updated":"2023-10-11T09:09:50Z","published":"2023-10-11T09:09:50Z","title":"Byzantine-Resilient Decentralized Multi-Armed Bandits","summary":" In decentralized cooperative multi-armed bandits (MAB), each agent observes a\ndistinct stream of rewards, and seeks to exchange information with others to\nselect a sequence of arms so as to minimize its regret. Agents in the\ncooperative setting can outperform a single agent running a MAB method such as\nUpper-Confidence Bound (UCB) independently. In this work, we study how to\nrecover such salient behavior when an unknown fraction of the agents can be\nByzantine, that is, communicate arbitrarily wrong information in the form of\nreward mean-estimates or confidence sets. This framework can be used to model\nattackers in computer networks, instigators of offensive content into\nrecommender systems, or manipulators of financial markets. Our key contribution\nis the development of a fully decentralized resilient upper confidence bound\n(UCB) algorithm that fuses an information mixing step among agents with a\ntruncation of inconsistent and extreme values. This truncation step enables us\nto establish that the performance of each normal agent is no worse than the\nclassic single-agent UCB1 algorithm in terms of regret, and more importantly,\nthe cumulative regret of all normal agents is strictly better than the\nnon-cooperative case, provided that each agent has at least 3f+1 neighbors\nwhere f is the maximum possible Byzantine agents in each agent's neighborhood.\nExtensions to time-varying neighbor graphs, and minimax lower bounds are\nfurther established on the achievable regret. Experiments corroborate the\nmerits of this framework in practice.\n","authors":["Jingxuan Zhu","Alec Koppel","Alvaro Velasquez","Ji Liu"],"pdf_url":"https://arxiv.org/pdf/2310.07320v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2203.07941v4","updated":"2023-10-11T09:07:24Z","published":"2022-03-15T14:25:44Z","title":"Reachability In Simple Neural Networks","summary":" We investigate the complexity of the reachability problem for (deep) neural\nnetworks: does it compute valid output given some valid input? It was recently\nclaimed that the problem is NP-complete for general neural networks and\nspecifications over the input/output dimension given by conjunctions of linear\ninequalities. We recapitulate the proof and repair some flaws in the original\nupper and lower bound proofs. Motivated by the general result, we show that\nNP-hardness already holds for restricted classes of simple specifications and\nneural networks. Allowing for a single hidden layer and an output dimension of\none as well as neural networks with just one negative, zero and one positive\nweight or bias is sufficient to ensure NP-hardness. 
Additionally, we give a\nthorough discussion and outlook of possible extensions for this direction of\nresearch on neural network verification.\n","authors":["Marco Sälzer","Martin Lange"],"pdf_url":"https://arxiv.org/pdf/2203.07941v4.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2108.13179"},{"id":"http://arxiv.org/abs/2104.05859v5","updated":"2023-10-11T09:07:01Z","published":"2021-04-12T23:14:41Z","title":"Rapid Exploration for Open-World Navigation with Latent Goal Models","summary":" We describe a robotic learning system for autonomous exploration and\nnavigation in diverse, open-world environments. At the core of our method is a\nlearned latent variable model of distances and actions, along with a\nnon-parametric topological memory of images. We use an information bottleneck\nto regularize the learned policy, giving us (i) a compact visual representation\nof goals, (ii) improved generalization capabilities, and (iii) a mechanism for\nsampling feasible goals for exploration. Trained on a large offline dataset of\nprior experience, the model acquires a representation of visual goals that is\nrobust to task-irrelevant distractors. We demonstrate our method on a mobile\nground robot in open-world exploration scenarios. Given an image of a goal that\nis up to 80 meters away, our method leverages its representation to explore and\ndiscover the goal in under 20 minutes, even amidst previously-unseen obstacles\nand weather conditions. Please check out the project website for videos of our\nexperiments and information about the real-world dataset used at\nhttps://sites.google.com/view/recon-robot.\n","authors":["Dhruv Shah","Benjamin Eysenbach","Gregory Kahn","Nicholas Rhinehart","Sergey Levine"],"pdf_url":"https://arxiv.org/pdf/2104.05859v5.pdf","comment":"Presented at 5th Annual Conference on Robot Learning (CoRL 2021),\n London, UK as an Oral Talk. Project page and dataset release at\n https://sites.google.com/view/recon-robot"},{"id":"http://arxiv.org/abs/2310.05052v2","updated":"2023-10-11T09:04:01Z","published":"2023-10-08T07:25:27Z","title":"Learning Intra- and Inter-Cell Differences for Accurate Battery Lifespan\n Prediction across Diverse Conditions","summary":" Battery life prediction holds significant practical value for battery\nresearch and development. Currently, many data-driven models rely on early\nelectrical signals from specific target batteries to predict their lifespan. A\ncommon shortfall is that most existing methods are developed based on specific\naging conditions, which not only limits their model's capability but also\ndiminishes their effectiveness in predicting degradation under varied\nconditions. As a result, these models often miss out on fully benefiting from\nthe rich historical data available under other conditions. Here, to address\nabove, we introduce an approach that explicitly captures differences between\nelectrical signals of a target battery and a reference battery, irrespective of\ntheir materials and aging conditions, to forecast the target battery life.\nThrough this inter-cell difference, we not only enhance the feature space but\nalso pave the way for a universal battery life prediction framework.\nRemarkably, our model that combines the inter- and intra-cell differences\nshines across diverse conditions, standing out in its efficiency and accuracy\nusing all accessible datasets. 
An essential application of our approach is its\ncapability to leverage data from older batteries effectively, enabling newer\nbatteries to capitalize on insights gained from past batteries. This work not\nonly enriches the battery data utilization strategy but also sets the stage for\nsmarter battery management system in the future.\n","authors":["Han Zhang","Yuqi Li","Shun Zheng","Ziheng Lu","Xiaofan Gui","Wei Xu","Jiang Bian"],"pdf_url":"https://arxiv.org/pdf/2310.05052v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07313v1","updated":"2023-10-11T09:00:02Z","published":"2023-10-11T09:00:02Z","title":"Molecule-Edit Templates for Efficient and Accurate Retrosynthesis\n Prediction","summary":" Retrosynthesis involves determining a sequence of reactions to synthesize\ncomplex molecules from simpler precursors. As this poses a challenge in organic\nchemistry, machine learning has offered solutions, particularly for predicting\npossible reaction substrates for a given target molecule. These solutions\nmainly fall into template-based and template-free categories. The former is\nefficient but relies on a vast set of predefined reaction patterns, while the\nlatter, though more flexible, can be computationally intensive and less\ninterpretable. To address these issues, we introduce METRO (Molecule-Edit\nTemplates for RetrOsynthesis), a machine-learning model that predicts reactions\nusing minimal templates - simplified reaction patterns capturing only essential\nmolecular changes - reducing computational overhead and achieving\nstate-of-the-art results on standard benchmarks.\n","authors":["Mikołaj Sacha","Michał Sadowski","Piotr Kozakowski","Ruard van Workum","Stanisław Jastrzębski"],"pdf_url":"https://arxiv.org/pdf/2310.07313v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07312v1","updated":"2023-10-11T08:57:59Z","published":"2023-10-11T08:57:59Z","title":"WiGenAI: The Symphony of Wireless and Generative AI via Diffusion Models","summary":" Innovative foundation models, such as GPT-3 and stable diffusion models, have\nmade a paradigm shift in the realm of artificial intelligence (AI) towards\ngenerative AI-based systems. In unison, from data communication and networking\nperspective, AI and machine learning (AI/ML) algorithms are envisioned to be\npervasively incorporated into the future generations of wireless communications\nsystems, highlighting the need for novel AI-native solutions for the emergent\ncommunication scenarios. In this article, we outline the applications of\ngenerative AI in wireless communication systems to lay the foundations for\nresearch in this field. Diffusion-based generative models, as the new\nstate-of-the-art paradigm of generative models, are introduced, and their\napplications in wireless communication systems are discussed. Two case studies\nare also presented to showcase how diffusion models can be exploited for the\ndevelopment of resilient AI-native communication systems. Specifically, we\npropose denoising diffusion probabilistic models (DDPM) for a wireless\ncommunication scheme with non-ideal transceivers, where 30% improvement is\nachieved in terms of bit error rate. As the second application, DDPMs are\nemployed at the transmitter to shape the constellation symbols, highlighting a\nrobust out-of-distribution performance. 
Finally, future directions and open\nissues for the development of generative AI-based wireless systems are\ndiscussed to promote future research endeavors towards wireless generative AI\n(WiGenAI).\n","authors":["Mehdi Letafati","Samad Ali","Matti Latva-aho"],"pdf_url":"https://arxiv.org/pdf/2310.07312v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.01276v2","updated":"2023-10-11T08:57:34Z","published":"2023-06-02T05:34:01Z","title":"Enhancing Sample Efficiency in Black-box Combinatorial Optimization via\n Symmetric Replay Training","summary":" Black-box combinatorial optimization (black-box CO) is frequently encountered\nin various industrial fields, such as drug discovery or hardware design.\nDespite its widespread relevance, solving black-box CO problems is highly\nchallenging due to the vast combinatorial solution space and resource-intensive\nnature of black-box function evaluations. These inherent complexities induce\nsignificant constraints on the efficacy of existing deep reinforcement learning\n(DRL) methods when applied to practical problem settings. For efficient\nexploration with the limited availability of function evaluations, this paper\nintroduces a new generic method to enhance sample efficiency. We propose\nsymmetric replay training that leverages the high-reward samples and their\nunder-explored regions in the symmetric space. In replay training, the policy\nis trained to imitate the symmetric trajectories of these high-rewarded\nsamples. The proposed method is beneficial for the exploration of highly\nrewarded regions without the necessity for additional online interactions -\nfree. The experimental results show that our method consistently improves the\nsample efficiency of various DRL methods on real-world tasks, including\nmolecular optimization and hardware design.\n","authors":["Hyeonah Kim","Minsu Kim","Sungsoo Ahn","Jinkyoo Park"],"pdf_url":"https://arxiv.org/pdf/2306.01276v2.pdf","comment":"18 pages (including 6 pages of the appendix)"},{"id":"http://arxiv.org/abs/2310.07306v1","updated":"2023-10-11T08:40:06Z","published":"2023-10-11T08:40:06Z","title":"SNOiC: Soft Labeling and Noisy Mixup based Open Intent Classification\n Model","summary":" This paper presents a Soft Labeling and Noisy Mixup-based open intent\nclassification model (SNOiC). Most of the previous works have used\nthreshold-based methods to identify open intents, which are prone to\noverfitting and may produce biased predictions. Additionally, the need for more\navailable data for an open intent class presents another limitation for these\nexisting models. SNOiC combines Soft Labeling and Noisy Mixup strategies to\nreduce the biasing and generate pseudo-data for open intent class. The\nexperimental results on four benchmark datasets show that the SNOiC model\nachieves a minimum and maximum performance of 68.72\\% and 94.71\\%,\nrespectively, in identifying open intents. Moreover, compared to\nstate-of-the-art models, the SNOiC model improves the performance of\nidentifying open intents by 0.93\\% (minimum) and 12.76\\% (maximum). The model's\nefficacy is further established by analyzing various parameters used in the\nproposed model. 
An ablation study is also conducted, which involves creating\nthree model variants to validate the effectiveness of the SNOiC model.\n","authors":["Aditi Kanwar","Aditi Seetha","Satyendra Singh Chouhan","Rajdeep Niyogi"],"pdf_url":"https://arxiv.org/pdf/2310.07306v1.pdf","comment":"9 Pages, 6 figures"},{"id":"http://arxiv.org/abs/2310.00113v2","updated":"2023-10-11T08:38:28Z","published":"2023-09-29T20:01:11Z","title":"HyperMask: Adaptive Hypernetwork-based Masks for Continual Learning","summary":" Artificial neural networks suffer from catastrophic forgetting when they are\nsequentially trained on multiple tasks. To overcome this problem, there exist\nmany continual learning strategies. One of the most effective is the\nhypernetwork-based approach. The hypernetwork generates the weights of a target\nmodel based on the task's identity. The model's main limitation is that\nhypernetwork can produce completely different nests for each task.\nConsequently, each task is solved separately. The model does not use\ninformation from the network dedicated to previous tasks and practically\nproduces new architectures when it learns the subsequent tasks. To solve such a\nproblem, we use the lottery ticket hypothesis, which postulates the existence\nof sparse subnetworks, named winning tickets, that preserve the performance of\na full network. In the paper, we propose a method called HyperMask, which\ntrains a single network for all tasks. Hypernetwork produces semi-binary masks\nto obtain target subnetworks dedicated to new tasks. This solution inherits the\nability of the hypernetwork to adapt to new tasks with minimal forgetting.\nMoreover, due to the lottery ticket hypothesis, we can use a single network\nwith weighted subnets dedicated to each task.\n","authors":["Kamil Książek","Przemysław Spurek"],"pdf_url":"https://arxiv.org/pdf/2310.00113v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.05672v2","updated":"2023-10-11T08:37:40Z","published":"2023-10-09T12:42:39Z","title":"Multi-timestep models for Model-based Reinforcement Learning","summary":" In model-based reinforcement learning (MBRL), most algorithms rely on\nsimulating trajectories from one-step dynamics models learned on data. A\ncritical challenge of this approach is the compounding of one-step prediction\nerrors as length of the trajectory grows. In this paper we tackle this issue by\nusing a multi-timestep objective to train one-step models. Our objective is a\nweighted sum of a loss function (e.g., negative log-likelihood) at various\nfuture horizons. We explore and test a range of weights profiles. We find that\nexponentially decaying weights lead to models that significantly improve the\nlong-horizon R2 score. This improvement is particularly noticeable when the\nmodels were evaluated on noisy data. Finally, using a soft actor-critic (SAC)\nagent in pure batch reinforcement learning (RL) and iterated batch RL\nscenarios, we found that our multi-timestep models outperform or match standard\none-step models. 
This was especially evident in a noisy variant of the\nconsidered environment, highlighting the potential of our approach in\nreal-world applications.\n","authors":["Abdelhakim Benechehab","Giuseppe Paolo","Albert Thomas","Maurizio Filippone","Balázs Kégl"],"pdf_url":"https://arxiv.org/pdf/2310.05672v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07298v1","updated":"2023-10-11T08:32:46Z","published":"2023-10-11T08:32:46Z","title":"Beyond Memorization: Violating Privacy Via Inference with Large Language\n Models","summary":" Current privacy research on large language models (LLMs) primarily focuses on\nthe issue of extracting memorized training data. At the same time, models'\ninference capabilities have increased drastically. This raises the key question\nof whether current LLMs could violate individuals' privacy by inferring\npersonal attributes from text given at inference time. In this work, we present\nthe first comprehensive study on the capabilities of pretrained LLMs to infer\npersonal attributes from text. We construct a dataset consisting of real Reddit\nprofiles, and show that current LLMs can infer a wide range of personal\nattributes (e.g., location, income, sex), achieving up to $85\\%$ top-1 and\n$95.8\\%$ top-3 accuracy at a fraction of the cost ($100\\times$) and time\n($240\\times$) required by humans. As people increasingly interact with\nLLM-powered chatbots across all aspects of life, we also explore the emerging\nthreat of privacy-invasive chatbots trying to extract personal information\nthrough seemingly benign questions. Finally, we show that common mitigations,\ni.e., text anonymization and model alignment, are currently ineffective at\nprotecting user privacy against LLM inference. Our findings highlight that\ncurrent LLMs can infer personal data at a previously unattainable scale. In the\nabsence of working defenses, we advocate for a broader discussion around LLM\nprivacy implications beyond memorization, striving for a wider privacy\nprotection.\n","authors":["Robin Staab","Mark Vero","Mislav Balunović","Martin Vechev"],"pdf_url":"https://arxiv.org/pdf/2310.07298v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07297v1","updated":"2023-10-11T08:31:26Z","published":"2023-10-11T08:31:26Z","title":"Score Regularized Policy Optimization through Diffusion Behavior","summary":" Recent developments in offline reinforcement learning have uncovered the\nimmense potential of diffusion modeling, which excels at representing\nheterogeneous behavior policies. However, sampling from diffusion policies is\nconsiderably slow because it necessitates tens to hundreds of iterative\ninference steps for one action. To address this issue, we propose to extract an\nefficient deterministic inference policy from critic models and pretrained\ndiffusion behavior models, leveraging the latter to directly regularize the\npolicy gradient with the behavior distribution's score function during\noptimization. 
Our method enjoys powerful generative capabilities of diffusion\nmodeling while completely circumventing the computationally intensive and\ntime-consuming diffusion sampling scheme, both during training and evaluation.\nExtensive results on D4RL tasks show that our method boosts action sampling\nspeed by more than 25 times compared with various leading diffusion-based\nmethods in locomotion tasks, while still maintaining state-of-the-art\nperformance.\n","authors":["Huayu Chen","Cheng Lu","Zhengyi Wang","Hang Su","Jun Zhu"],"pdf_url":"https://arxiv.org/pdf/2310.07297v1.pdf","comment":"18 pages"},{"id":"http://arxiv.org/abs/2310.07276v1","updated":"2023-10-11T07:57:08Z","published":"2023-10-11T07:57:08Z","title":"BioT5: Enriching Cross-modal Integration in Biology with Chemical\n Knowledge and Natural Language Associations","summary":" Recent advancements in biological research leverage the integration of\nmolecules, proteins, and natural language to enhance drug discovery. However,\ncurrent models exhibit several limitations, such as the generation of invalid\nmolecular SMILES, underutilization of contextual information, and equal\ntreatment of structured and unstructured knowledge. To address these issues, we\npropose $\\mathbf{BioT5}$, a comprehensive pre-training framework that enriches\ncross-modal integration in biology with chemical knowledge and natural language\nassociations. $\\mathbf{BioT5}$ utilizes SELFIES for $100%$ robust molecular\nrepresentations and extracts knowledge from the surrounding context of\nbio-entities in unstructured biological literature. Furthermore,\n$\\mathbf{BioT5}$ distinguishes between structured and unstructured knowledge,\nleading to more effective utilization of information. After fine-tuning, BioT5\nshows superior performance across a wide range of tasks, demonstrating its\nstrong capability of capturing underlying relations and properties of\nbio-entities. Our code is available at\n$\\href{https://github.com/QizhiPei/BioT5}{Github}$.\n","authors":["Qizhi Pei","Wei Zhang","Jinhua Zhu","Kehan Wu","Kaiyuan Gao","Lijun Wu","Yingce Xia","Rui Yan"],"pdf_url":"https://arxiv.org/pdf/2310.07276v1.pdf","comment":"Empirical Methods in Natural Language Processing (EMNLP 2023)"},{"id":"http://arxiv.org/abs/2310.07269v1","updated":"2023-10-11T07:51:10Z","published":"2023-10-11T07:51:10Z","title":"Why Does Sharpness-Aware Minimization Generalize Better Than SGD?","summary":" The challenge of overfitting, in which the model memorizes the training data\nand fails to generalize to test data, has become increasingly significant in\nthe training of large neural networks. To tackle this challenge,\nSharpness-Aware Minimization (SAM) has emerged as a promising training method,\nwhich can improve the generalization of neural networks even in the presence of\nlabel noise. However, a deep understanding of how SAM works, especially in the\nsetting of nonlinear neural networks and classification tasks, remains largely\nmissing. This paper fills this gap by demonstrating why SAM generalizes better\nthan Stochastic Gradient Descent (SGD) for a certain data model and two-layer\nconvolutional ReLU networks. The loss landscape of our studied problem is\nnonsmooth, thus current explanations for the success of SAM based on the\nHessian information are insufficient. Our result explains the benefits of SAM,\nparticularly its ability to prevent noise learning in the early stages, thereby\nfacilitating more effective learning of features. 
Experiments on both synthetic\nand real data corroborate our theory.\n","authors":["Zixiang Chen","Junkai Zhang","Yiwen Kou","Xiangning Chen","Cho-Jui Hsieh","Quanquan Gu"],"pdf_url":"https://arxiv.org/pdf/2310.07269v1.pdf","comment":"52 pages, 4 figures, 2 tables. In NeurIPS 2023"},{"id":"http://arxiv.org/abs/2310.07268v1","updated":"2023-10-11T07:50:51Z","published":"2023-10-11T07:50:51Z","title":"RaftFed: A Lightweight Federated Learning Framework for Vehicular Crowd\n Intelligence","summary":" Vehicular crowd intelligence (VCI) is an emerging research field. Facilitated\nby state-of-the-art vehicular ad-hoc networks and artificial intelligence,\nvarious VCI applications come to place, e.g., collaborative sensing,\npositioning, and mapping. The collaborative property of VCI applications\ngenerally requires data to be shared among participants, thus forming\nnetwork-wide intelligence. How to fulfill this process without compromising\ndata privacy remains a challenging issue. Although federated learning (FL) is a\npromising tool to solve the problem, adapting conventional FL frameworks to VCI\nis nontrivial. First, the centralized model aggregation is unreliable in VCI\nbecause of the existence of stragglers with unfavorable channel conditions.\nSecond, existing FL schemes are vulnerable to Non-IID data, which is\nintensified by the data heterogeneity in VCI. This paper proposes a novel\nfederated learning framework called RaftFed to facilitate privacy-preserving\nVCI. The experimental results show that RaftFed performs better than baselines\nregarding communication overhead, model accuracy, and model convergence.\n","authors":["Changan Yang","Yaxing Chen","Yao Zhang","Helei Cui","Zhiwen Yu","Bin Guo","Zheng Yan","Zijiang Yang"],"pdf_url":"https://arxiv.org/pdf/2310.07268v1.pdf","comment":"8 pages,8 figures"},{"id":"http://arxiv.org/abs/2304.03864v2","updated":"2023-10-11T07:50:28Z","published":"2023-04-07T23:25:48Z","title":"SGDP: A Stream-Graph Neural Network Based Data Prefetcher","summary":" Data prefetching is important for storage system optimization and access\nperformance improvement. Traditional prefetchers work well for mining access\npatterns of sequential logical block address (LBA) but cannot handle complex\nnon-sequential patterns that commonly exist in real-world applications. The\nstate-of-the-art (SOTA) learning-based prefetchers cover more LBA accesses.\nHowever, they do not adequately consider the spatial interdependencies between\nLBA deltas, which leads to limited performance and robustness. This paper\nproposes a novel Stream-Graph neural network-based Data Prefetcher (SGDP).\nSpecifically, SGDP models LBA delta streams using a weighted directed graph\nstructure to represent interactive relations among LBA deltas and further\nextracts hybrid features by graph neural networks for data prefetching. We\nconduct extensive experiments on eight real-world datasets. Empirical results\nverify that SGDP outperforms the SOTA methods in terms of the hit ratio by\n6.21%, the effective prefetching ratio by 7.00%, and speeds up inference time\nby 3.13X on average. Besides, we generalize SGDP to different variants by\ndifferent stream constructions, further expanding its application scenarios and\ndemonstrating its robustness. SGDP offers a novel data prefetching solution and\nhas been verified in commercial hybrid storage systems in the experimental\nphase. 
Our codes and appendix are available at\nhttps://github.com/yyysjz1997/SGDP/.\n","authors":["Yiyuan Yang","Rongshang Li","Qiquan Shi","Xijun Li","Gang Hu","Xing Li","Mingxuan Yuan"],"pdf_url":"https://arxiv.org/pdf/2304.03864v2.pdf","comment":"Accepted by International Joint Conference on Neural Networks (IJCNN\n 2023)"},{"id":"http://arxiv.org/abs/2306.10347v2","updated":"2023-10-11T07:50:09Z","published":"2023-06-17T13:40:15Z","title":"DCdetector: Dual Attention Contrastive Representation Learning for Time\n Series Anomaly Detection","summary":" Time series anomaly detection is critical for a wide range of applications.\nIt aims to identify deviant samples from the normal sample distribution in time\nseries. The most fundamental challenge for this task is to learn a\nrepresentation map that enables effective discrimination of anomalies.\nReconstruction-based methods still dominate, but the representation learning\nwith anomalies might hurt the performance with its large abnormal loss. On the\nother hand, contrastive learning aims to find a representation that can clearly\ndistinguish any instance from the others, which can bring a more natural and\npromising representation for time series anomaly detection. In this paper, we\npropose DCdetector, a multi-scale dual attention contrastive representation\nlearning model. DCdetector utilizes a novel dual attention asymmetric design to\ncreate the permutated environment and pure contrastive loss to guide the\nlearning process, thus learning a permutation invariant representation with\nsuperior discrimination abilities. Extensive experiments show that DCdetector\nachieves state-of-the-art results on multiple time series anomaly detection\nbenchmark datasets. Code is publicly available at\nhttps://github.com/DAMO-DI-ML/KDD2023-DCdetector.\n","authors":["Yiyuan Yang","Chaoli Zhang","Tian Zhou","Qingsong Wen","Liang Sun"],"pdf_url":"https://arxiv.org/pdf/2306.10347v2.pdf","comment":"Accepted by ACM SIGKDD International Conference on Knowledge\n Discovery & Data Mining (KDD 2023)"},{"id":"http://arxiv.org/abs/2305.00472v2","updated":"2023-10-11T07:47:07Z","published":"2023-04-30T13:10:56Z","title":"Efficient MILP Decomposition in Quantum Computing for ReLU Network\n Robustness","summary":" Emerging quantum computing technologies, such as Noisy Intermediate-Scale\nQuantum (NISQ) devices, offer potential advancements in solving mathematical\noptimization problems. However, limitations in qubit availability, noise, and\nerrors pose challenges for practical implementation. In this study, we examine\ntwo decomposition methods for Mixed-Integer Linear Programming (MILP) designed\nto reduce the original problem size and utilize available NISQ devices more\nefficiently. We concentrate on breaking down the original problem into smaller\nsubproblems, which are then solved iteratively using a combined\nquantum-classical hardware approach. We conduct a detailed analysis for the\ndecomposition of MILP with Benders and Dantzig-Wolfe methods. In our analysis,\nwe show that the number of qubits required to solve Benders is exponentially\nlarge in the worst-case, while remains constant for Dantzig-Wolfe.\nAdditionally, we leverage Dantzig-Wolfe decomposition on the use-case of\ncertifying the robustness of ReLU networks. 
Our experimental results\ndemonstrate that this approach can save up to 90\\% of qubits compared to\nexisting methods on quantum annealing and gate-based quantum computers.\n","authors":["Nicola Franco","Tom Wollschläger","Benedikt Poggel","Stephan Günnemann","Jeanette Miriam Lorenz"],"pdf_url":"https://arxiv.org/pdf/2305.00472v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07264v1","updated":"2023-10-11T07:40:46Z","published":"2023-10-11T07:40:46Z","title":"Classification of Dysarthria based on the Levels of Severity. A\n Systematic Review","summary":" Dysarthria is a neurological speech disorder that can significantly impact\naffected individuals' communication abilities and overall quality of life. The\naccurate and objective classification of dysarthria and the determination of\nits severity are crucial for effective therapeutic intervention. While\ntraditional assessments by speech-language pathologists (SLPs) are common, they\nare often subjective, time-consuming, and can vary between practitioners.\nEmerging machine learning-based models have shown the potential to provide a\nmore objective dysarthria assessment, enhancing diagnostic accuracy and\nreliability. This systematic review aims to comprehensively analyze current\nmethodologies for classifying dysarthria based on severity levels.\nSpecifically, this review will focus on determining the most effective set and\ntype of features that can be used for automatic patient classification and\nevaluating the best AI techniques for this purpose. We will systematically\nreview the literature on the automatic classification of dysarthria severity\nlevels. Sources of information will include electronic databases and grey\nliterature. Selection criteria will be established based on relevance to the\nresearch questions. Data extraction will include methodologies used, the type\nof features extracted for classification, and AI techniques employed. The\nfindings of this systematic review will contribute to the current understanding\nof dysarthria classification, inform future research, and support the\ndevelopment of improved diagnostic tools. The implications of these findings\ncould be significant in advancing patient care and improving therapeutic\noutcomes for individuals affected by dysarthria.\n","authors":["Afnan Al-Ali","Somaya Al-Maadeed","Moutaz Saleh","Rani Chinnappa Naidu","Zachariah C Alex","Prakash Ramachandran","Rajeev Khoodeeram","Rajesh Kumar M"],"pdf_url":"https://arxiv.org/pdf/2310.07264v1.pdf","comment":"no comments"},{"id":"http://arxiv.org/abs/2310.07261v1","updated":"2023-10-11T07:38:37Z","published":"2023-10-11T07:38:37Z","title":"Deep ReLU networks and high-order finite element methods II: Chebyshev\n emulation","summary":" Expression rates and stability in Sobolev norms of deep ReLU neural networks\n(NNs) in terms of the number of parameters defining the NN for continuous,\npiecewise polynomial functions, on arbitrary, finite partitions $\\mathcal{T}$\nof a bounded interval $(a,b)$ are addressed. Novel constructions of ReLU NN\nsurrogates encoding the approximated functions in terms of Chebyshev polynomial\nexpansion coefficients are developed. Chebyshev coefficients can be computed\neasily from the values of the function in the Clenshaw--Curtis points using the\ninverse fast Fourier transform. Bounds on expression rates and stability that\nare superior to those of constructions based on ReLU NN emulations of monomials\nconsidered in [Opschoor, Petersen, Schwab, 2020] are obtained. 
All emulation\nbounds are explicit in terms of the (arbitrary) partition of the interval, the\ntarget emulation accuracy and the polynomial degree in each element of the\npartition. ReLU NN emulation error estimates are provided for various classes\nof functions and norms, commonly encountered in numerical analysis. In\nparticular, we show exponential ReLU emulation rate bounds for analytic\nfunctions with point singularities and develop an interface between Chebfun\napproximations and constructive ReLU NN emulations.\n","authors":["Joost A. A. Opschoor","Christoph Schwab"],"pdf_url":"https://arxiv.org/pdf/2310.07261v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.08786v6","updated":"2023-10-11T07:37:01Z","published":"2022-10-17T07:01:17Z","title":"Exposing Influence Campaigns in the Age of LLMs: A Behavioral-Based AI\n Approach to Detecting State-Sponsored Trolls","summary":" The detection of state-sponsored trolls operating in influence campaigns on\nsocial media is a critical and unsolved challenge for the research community,\nwhich has significant implications beyond the online realm. To address this\nchallenge, we propose a new AI-based solution that identifies troll accounts\nsolely through behavioral cues associated with their sequences of sharing\nactivity, encompassing both their actions and the feedback they receive from\nothers. Our approach does not incorporate any textual content shared and\nconsists of two steps: First, we leverage an LSTM-based classifier to determine\nwhether account sequences belong to a state-sponsored troll or an organic,\nlegitimate user. Second, we employ the classified sequences to calculate a\nmetric named the \"Troll Score\", quantifying the degree to which an account\nexhibits troll-like behavior. To assess the effectiveness of our method, we\nexamine its performance in the context of the 2016 Russian interference\ncampaign during the U.S. Presidential election. Our experiments yield\ncompelling results, demonstrating that our approach can identify account\nsequences with an AUC close to 99% and accurately differentiate between Russian\ntrolls and organic users with an AUC of 91%. Notably, our behavioral-based\napproach holds a significant advantage in the ever-evolving landscape, where\ntextual and linguistic properties can be easily mimicked by Large Language\nModels (LLMs): In contrast to existing language-based techniques, it relies on\nmore challenging-to-replicate behavioral cues, ensuring greater resilience in\nidentifying influence campaigns, especially given the potential increase in the\nusage of LLMs for generating inauthentic content. Finally, we assessed the\ngeneralizability of our solution to various entities driving different\ninformation operations and found promising results that will guide future\nresearch.\n","authors":["Fatima Ezzeddine","Luca Luceri","Omran Ayoub","Ihab Sbeity","Gianluca Nogara","Emilio Ferrara","Silvia Giordano"],"pdf_url":"https://arxiv.org/pdf/2210.08786v6.pdf","comment":"22"},{"id":"http://arxiv.org/abs/2306.01984v2","updated":"2023-10-11T07:35:27Z","published":"2023-06-03T02:46:31Z","title":"DYffusion: A Dynamics-informed Diffusion Model for Spatiotemporal\n Forecasting","summary":" While diffusion models can successfully generate data and make predictions,\nthey are predominantly designed for static images. 
We propose an approach for\nefficiently training diffusion models for probabilistic spatiotemporal\nforecasting, where generating stable and accurate rollout forecasts remains\nchallenging, Our method, DYffusion, leverages the temporal dynamics in the\ndata, directly coupling it with the diffusion steps in the model. We train a\nstochastic, time-conditioned interpolator and a forecaster network that mimic\nthe forward and reverse processes of standard diffusion models, respectively.\nDYffusion naturally facilitates multi-step and long-range forecasting, allowing\nfor highly flexible, continuous-time sampling trajectories and the ability to\ntrade-off performance with accelerated sampling at inference time. In addition,\nthe dynamics-informed diffusion process in DYffusion imposes a strong inductive\nbias and significantly improves computational efficiency compared to\ntraditional Gaussian noise-based diffusion models. Our approach performs\ncompetitively on probabilistic forecasting of complex dynamics in sea surface\ntemperatures, Navier-Stokes flows, and spring mesh systems.\n","authors":["Salva Rühling Cachay","Bo Zhao","Hailey Joren","Rose Yu"],"pdf_url":"https://arxiv.org/pdf/2306.01984v2.pdf","comment":"Accepted to NeurIPS 2023; Code is available at:\n https://github.com/Rose-STL-Lab/dyffusion"},{"id":"http://arxiv.org/abs/2310.07253v1","updated":"2023-10-11T07:30:18Z","published":"2023-10-11T07:30:18Z","title":"ADMEOOD: Out-of-Distribution Benchmark for Drug Property Prediction","summary":" Obtaining accurate and valid information for drug molecules is a crucial and\nchallenging task. However, chemical knowledge and information have been\naccumulated over the past 100 years from various regions, laboratories, and\nexperimental purposes. Little has been explored in terms of the\nout-of-distribution (OOD) problem with noise and inconsistency, which may lead\nto weak robustness and unsatisfied performance. This study proposes a novel\nbenchmark ADMEOOD, a systematic OOD dataset curator and benchmark specifically\ndesigned for drug property prediction. ADMEOOD obtained 27 ADME (Absorption,\nDistribution, Metabolism, Excretion) drug properties from Chembl and relevant\nliterature. Additionally, it includes two kinds of OOD data shifts: Noise Shift\nand Concept Conflict Drift (CCD). Noise Shift responds to the noise level by\ncategorizing the environment into different confidence levels. On the other\nhand, CCD describes the data which has inconsistent label among the original\ndata. Finally, it tested on a variety of domain generalization models, and the\nexperimental results demonstrate the effectiveness of the proposed partition\nmethod in ADMEOOD: ADMEOOD demonstrates a significant difference performance\nbetween in-distribution and out-of-distribution data. Moreover, ERM (Empirical\nRisk Minimization) and other models exhibit distinct trends in performance\nacross different domains and measurement types.\n","authors":["Shuoying Wei","Xinlong Wen","Lida Zhu","Songquan Li","Rongbo Zhu"],"pdf_url":"https://arxiv.org/pdf/2310.07253v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07252v1","updated":"2023-10-11T07:30:01Z","published":"2023-10-11T07:30:01Z","title":"A Comparative Study of Pre-trained CNNs and GRU-Based Attention for\n Image Caption Generation","summary":" Image captioning is a challenging task involving generating a textual\ndescription for an image using computer vision and natural language processing\ntechniques. 
This paper proposes a deep neural framework for image caption\ngeneration using a GRU-based attention mechanism. Our approach employs multiple\npre-trained convolutional neural networks as the encoder to extract features\nfrom the image and a GRU-based language model as the decoder to generate\ndescriptive sentences. To improve performance, we integrate the Bahdanau\nattention model with the GRU decoder to enable learning to focus on specific\nimage parts. We evaluate our approach using the MSCOCO and Flickr30k datasets\nand show that it achieves competitive scores compared to state-of-the-art\nmethods. Our proposed framework can bridge the gap between computer vision and\nnatural language and can be extended to specific domains.\n","authors":["Rashid Khan","Bingding Huang","Haseeb Hassan","Asim Zaman","Zhongfu Ye"],"pdf_url":"https://arxiv.org/pdf/2310.07252v1.pdf","comment":"15pages, 10 figures, 5 tables. 2023 the 5th International Conference\n on Robotics and Computer Vision (ICRCV 2023). arXiv admin note: substantial\n text overlap with arXiv:2203.01594"},{"id":"http://arxiv.org/abs/2310.07250v1","updated":"2023-10-11T07:27:28Z","published":"2023-10-11T07:27:28Z","title":"Synthesizing Missing MRI Sequences from Available Modalities using\n Generative Adversarial Networks in BraTS Dataset","summary":" Glioblastoma is a highly aggressive and lethal form of brain cancer. Magnetic\nresonance imaging (MRI) plays a significant role in the diagnosis, treatment\nplanning, and follow-up of glioblastoma patients due to its non-invasive and\nradiation-free nature. The International Brain Tumor Segmentation (BraTS)\nchallenge has contributed to generating numerous AI algorithms to accurately\nand efficiently segment glioblastoma sub-compartments using four structural\n(T1, T1Gd, T2, T2-FLAIR) MRI scans. However, these four MRI sequences may not\nalways be available. To address this issue, Generative Adversarial Networks\n(GANs) can be used to synthesize the missing MRI sequences. In this paper, we\nimplement and utilize an open-source GAN approach that takes any three MRI\nsequences as input to generate the missing fourth structural sequence. Our\nproposed approach is contributed to the community-driven generally nuanced deep\nlearning framework (GaNDLF) and demonstrates promising results in synthesizing\nhigh-quality and realistic MRI sequences, enabling clinicians to improve their\ndiagnostic capabilities and support the application of AI methods to brain\ntumor MRI quantification.\n","authors":["Ibrahim Ethem Hamamci"],"pdf_url":"https://arxiv.org/pdf/2310.07250v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07245v1","updated":"2023-10-11T07:22:37Z","published":"2023-10-11T07:22:37Z","title":"Crowd Counting in Harsh Weather using Image Denoising with Pix2Pix GANs","summary":" Visual crowd counting estimates the density of the crowd using deep learning\nmodels such as convolution neural networks (CNNs). The performance of the model\nheavily relies on the quality of the training data that constitutes crowd\nimages. In harsh weather such as fog, dust, and low light conditions, the\ninference performance may severely degrade on the noisy and blur images. 
In\nthis paper, we propose the use of Pix2Pix generative adversarial network (GAN)\nto first denoise the crowd images prior to passing them to the counting model.\nA Pix2Pix network is trained using synthetic noisy images generated from\noriginal crowd images and then the pretrained generator is then used in the\ninference engine to estimate the crowd density in unseen, noisy crowd images.\nThe performance is tested on JHU-Crowd dataset to validate the significance of\nthe proposed method particularly when high reliability and accuracy are\nrequired.\n","authors":["Muhammad Asif Khan","Hamid Menouar","Ridha Hamila"],"pdf_url":"https://arxiv.org/pdf/2310.07245v1.pdf","comment":"The paper has been accepted for presentation in IEEE 38th\n International Conference on Image and Vision Computing New Zealand (IVCNZ\n 2023). The final manuscript can be accessed at ieeexplore"},{"id":"http://arxiv.org/abs/2310.07241v1","updated":"2023-10-11T07:13:16Z","published":"2023-10-11T07:13:16Z","title":"Surrogate modeling for stochastic crack growth processes in structural\n health monitoring applications","summary":" Fatigue crack growth is one of the most common types of deterioration in\nmetal structures with significant implications on their reliability. Recent\nadvances in Structural Health Monitoring (SHM) have motivated the use of\nstructural response data to predict future crack growth under uncertainty, in\norder to enable a transition towards predictive maintenance. Accurately\nrepresenting different sources of uncertainty in stochastic crack growth (SCG)\nprocesses is a non-trivial task. The present work builds on previous research\non physics-based SCG modeling under both material and load-related uncertainty.\nThe aim here is to construct computationally efficient, probabilistic surrogate\nmodels for SCG processes that successfully encode these different sources of\nuncertainty. An approach inspired by latent variable modeling is employed that\nutilizes Gaussian Process (GP) regression models to enable the surrogates to be\nused to generate prior distributions for different Bayesian SHM tasks as the\napplication of interest. Implementation is carried out in a numerical setting\nand model performance is assessed for two fundamental crack SHM problems;\nnamely crack length monitoring (damage quantification) and crack growth\nmonitoring (damage prognosis).\n","authors":["Nicholas E. Silionis","Konstantinos N. Anyfantis"],"pdf_url":"https://arxiv.org/pdf/2310.07241v1.pdf","comment":"20 pages, 9 figures. Preprint submitted to Elsevier journal"},{"id":"http://arxiv.org/abs/2310.07240v1","updated":"2023-10-11T07:08:20Z","published":"2023-10-11T07:08:20Z","title":"CacheGen: Fast Context Loading for Language Model Applications","summary":" As large language models (LLMs) take on more complex tasks, their inputs\nincorporate longer contexts to respond to questions that require domain\nknowledge or user-specific conversational histories. Yet, using long contexts\nposes a challenge for responsive LLM systems, as nothing can be generated until\nall the contexts are fetched to and processed by the LLM. Existing systems\noptimize only the computation delay in context processing (e.g., by caching\nintermediate key-value features of the text context) but often cause longer\nnetwork delays in context fetching (e.g., key-value features consume orders of\nmagnitude larger bandwidth than the text context).\n This paper presents CacheGen to minimize the delays in fetching and\nprocessing contexts for LLMs. 
CacheGen reduces the bandwidth needed for\ntransmitting long contexts' key-value (KV) features through a novel encoder\nthat compresses KV features into more compact bitstream representations. The\nencoder combines adaptive quantization with a tailored arithmetic coder, taking\nadvantage of the KV features' distributional properties, such as locality\nacross tokens. Furthermore, CacheGen minimizes the total delay in fetching and\nprocessing a context by using a controller that determines when to load the\ncontext as compressed KV features or raw text and picks the appropriate\ncompression level if loaded as KV features. We test CacheGen on three models of\nvarious sizes and three datasets of different context lengths. Compared to\nrecent methods that handle long contexts, CacheGen reduces bandwidth usage by\n3.7-4.3x and the total delay in fetching and processing contexts by 2.7-3x\nwhile maintaining similar LLM performance on various tasks as loading the text\ncontexts.\n","authors":["Yuhan Liu","Hanchen Li","Kuntai Du","Jiayi Yao","Yihua Cheng","Yuyang Huang","Shan Lu","Michael Maire","Henry Hoffmann","Ari Holtzman","Ganesh Ananthanarayanan","Junchen Jiang"],"pdf_url":"https://arxiv.org/pdf/2310.07240v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07235v1","updated":"2023-10-11T06:53:05Z","published":"2023-10-11T06:53:05Z","title":"Are GATs Out of Balance?","summary":" While the expressive power and computational capabilities of graph neural\nnetworks (GNNs) have been theoretically studied, their optimization and\nlearning dynamics, in general, remain largely unexplored. Our study undertakes\nthe Graph Attention Network (GAT), a popular GNN architecture in which a node's\nneighborhood aggregation is weighted by parameterized attention coefficients.\nWe derive a conservation law of GAT gradient flow dynamics, which explains why\na high portion of parameters in GATs with standard initialization struggle to\nchange during training. This effect is amplified in deeper GATs, which perform\nsignificantly worse than their shallow counterparts. To alleviate this problem,\nwe devise an initialization scheme that balances the GAT network. Our approach\ni) allows more effective propagation of gradients and in turn enables\ntrainability of deeper networks, and ii) attains a considerable speedup in\ntraining and convergence time in comparison to the standard initialization. Our\nmain theorem serves as a stepping stone to studying the learning dynamics of\npositive homogeneous models with attention mechanisms.\n","authors":["Nimrah Mustafa","Aleksandar Bojchevski","Rebekka Burkholz"],"pdf_url":"https://arxiv.org/pdf/2310.07235v1.pdf","comment":"24 pages. To be published in Advances in Neural Information\n Processing Systems (NeurIPS), 2023"},{"id":"http://arxiv.org/abs/2310.07234v1","updated":"2023-10-11T06:51:46Z","published":"2023-10-11T06:51:46Z","title":"Hierarchical Decomposition of Prompt-Based Continual Learning:\n Rethinking Obscured Sub-optimality","summary":" Prompt-based continual learning is an emerging direction in leveraging\npre-trained knowledge for downstream continual learning, and has almost reached\nthe performance pinnacle under supervised pre-training. However, our empirical\nresearch reveals that the current strategies fall short of their full potential\nunder the more realistic self-supervised pre-training, which is essential for\nhandling vast quantities of unlabeled data in practice. 
This is largely due to\nthe difficulty of task-specific knowledge being incorporated into instructed\nrepresentations via prompt parameters and predicted by uninstructed\nrepresentations at test time. To overcome the exposed sub-optimality, we\nconduct a theoretical analysis of the continual learning objective in the\ncontext of pre-training, and decompose it into hierarchical components:\nwithin-task prediction, task-identity inference, and task-adaptive prediction.\nFollowing these empirical and theoretical insights, we propose Hierarchical\nDecomposition (HiDe-)Prompt, an innovative approach that explicitly optimizes\nthe hierarchical components with an ensemble of task-specific prompts and\nstatistics of both uninstructed and instructed representations, further with\nthe coordination of a contrastive regularization strategy. Our extensive\nexperiments demonstrate the superior performance of HiDe-Prompt and its\nrobustness to pre-training paradigms in continual learning (e.g., up to 15.01%\nand 9.61% lead on Split CIFAR-100 and Split ImageNet-R, respectively). Our code\nis available at \\url{https://github.com/thu-ml/HiDe-Prompt}.\n","authors":["Liyuan Wang","Jingyi Xie","Xingxing Zhang","Mingyi Huang","Hang Su","Jun Zhu"],"pdf_url":"https://arxiv.org/pdf/2310.07234v1.pdf","comment":"23 pages, 20 figures, 11 tables, accepted by NeurIPS as a Spotlight"},{"id":"http://arxiv.org/abs/2212.12180v4","updated":"2023-10-11T06:49:08Z","published":"2022-12-23T07:42:56Z","title":"Autothrottle: A Practical Bi-Level Approach to Resource Management for\n SLO-Targeted Microservices","summary":" Achieving resource efficiency while preserving end-user experience is\nnon-trivial for cloud application operators. As cloud applications\nprogressively adopt microservices, resource managers are faced with two\ndistinct levels of system behavior: end-to-end application latency and\nper-service resource usage. Translating between the two levels, however, is\nchallenging because user requests traverse heterogeneous services that\ncollectively (but unevenly) contribute to the end-to-end latency. We present\nAutothrottle, a bi-level resource management framework for microservices with\nlatency SLOs (service-level objectives). It architecturally decouples\napplication SLO feedback from service resource control, and bridges them\nthrough the notion of performance targets. Specifically, an application-wide\nlearning-based controller is employed to periodically set performance targets\n-- expressed as CPU throttle ratios -- for per-service heuristic controllers to\nattain. We evaluate Autothrottle on three microservice applications, with\nworkload traces from production scenarios. Results show superior CPU savings,\nup to 26.21% over the best-performing baseline and up to 93.84% over all\nbaselines.\n","authors":["Zibo Wang","Pinghe Li","Chieh-Jan Mike Liang","Feng Wu","Francis Y. Yan"],"pdf_url":"https://arxiv.org/pdf/2212.12180v4.pdf","comment":"Accepted by USENIX NSDI '24"},{"id":"http://arxiv.org/abs/2310.07229v1","updated":"2023-10-11T06:36:23Z","published":"2023-10-11T06:36:23Z","title":"Self-supervised Pocket Pretraining via Protein Fragment-Surroundings\n Alignment","summary":" Pocket representations play a vital role in various biomedical applications,\nsuch as druggability estimation, ligand affinity prediction, and de novo drug\ndesign. 
While existing geometric features and pretrained representations have\ndemonstrated promising results, they usually treat pockets independent of\nligands, neglecting the fundamental interactions between them. However, the\nlimited pocket-ligand complex structures available in the PDB database (less\nthan 100 thousand non-redundant pairs) hampers large-scale pretraining\nendeavors for interaction modeling. To address this constraint, we propose a\nnovel pocket pretraining approach that leverages knowledge from high-resolution\natomic protein structures, assisted by highly effective pretrained small\nmolecule representations. By segmenting protein structures into drug-like\nfragments and their corresponding pockets, we obtain a reasonable simulation of\nligand-receptor interactions, resulting in the generation of over 5 million\ncomplexes. Subsequently, the pocket encoder is trained in a contrastive manner\nto align with the representation of pseudo-ligand furnished by some pretrained\nsmall molecule encoders. Our method, named ProFSA, achieves state-of-the-art\nperformance across various tasks, including pocket druggability prediction,\npocket matching, and ligand binding affinity prediction. Notably, ProFSA\nsurpasses other pretraining methods by a substantial margin. Moreover, our work\nopens up a new avenue for mitigating the scarcity of protein-ligand complex\ndata through the utilization of high-quality and diverse protein structure\ndatabases.\n","authors":["Bowen Gao","Yinjun Jia","Yuanle Mo","Yuyan Ni","Weiying Ma","Zhiming Ma","Yanyan Lan"],"pdf_url":"https://arxiv.org/pdf/2310.07229v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.12421v3","updated":"2023-10-11T06:32:32Z","published":"2022-11-11T02:14:28Z","title":"Data-Driven Network Neuroscience: On Data Collection and Benchmark","summary":" This paper presents a comprehensive and quality collection of functional\nhuman brain \\emph{network} data for potential research in the intersection of\nneuroscience, machine learning, and graph analytics. Anatomical and functional\nMRI images have been used to understand the functional connectivity of the\nhuman brain and are particularly important in identifying underlying\nneurodegenerative conditions such as Alzheimer's, Parkinson's, and Autism.\nRecently, the study of the brain in the form of brain networks using machine\nlearning and graph analytics has become increasingly popular, especially to\npredict the early onset of these conditions. A brain network, represented as a\ngraph, retains rich structural and positional information that traditional\nexamination methods are unable to capture. However, the lack of publicly\naccessible brain network data prevents researchers from data-driven\nexplorations. One of the main difficulties lies in the complicated\ndomain-specific preprocessing steps and the exhaustive computation required to\nconvert the data from MRI images into brain networks. We bridge this gap by\ncollecting a large amount of MRI images from public databases and a private\nsource, working with domain experts to make sensible design choices, and\npreprocessing the MRI images to produce a collection of brain network datasets.\nThe datasets originate from 6 different sources, cover 4 brain conditions, and\nconsist of a total of 2,702 subjects. We test our graph datasets on 12 machine\nlearning models to provide baselines and validate the data quality on a recent\ngraph analysis model. 
To lower the barrier to entry and promote the research in\nthis interdisciplinary field, we release our brain network data and complete\npreprocessing details including codes at\nhttps://doi.org/10.17608/k6.auckland.21397377 and\nhttps://figshare.com/s/fa33c10664ca08b022ce.\n","authors":["Jiaxing Xu","Yunhan Yang","David Tse Jung Huang","Sophi Shilpa Gururajapathy","Yiping Ke","Miao Qiao","Alan Wang","Haribalan Kumar","Josh McGeown","Eryn Kwon"],"pdf_url":"https://arxiv.org/pdf/2211.12421v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.17382v2","updated":"2023-10-11T06:18:04Z","published":"2023-09-29T16:36:39Z","title":"Reason for Future, Act for Now: A Principled Framework for Autonomous\n LLM Agents with Provable Sample Efficiency","summary":" Large language models (LLMs) demonstrate impressive reasoning abilities, but\ntranslating reasoning into actions in the real world remains challenging. In\nparticular, it remains unclear how to complete a given task provably within a\nminimum number of interactions with the external environment, e.g., through an\ninternal mechanism of reasoning. To this end, we propose a principled framework\nwith provable regret guarantees to orchestrate reasoning and acting, which we\ncall \"reason for future, act for now\" (\\texttt{RAFA}). Specifically, we design\na prompt template for reasoning that learns from the memory buffer and plans a\nfuture trajectory over a long horizon (\"reason for future\"). At each step, the\nLLM agent takes the initial action of the planned trajectory (\"act for now\"),\nstores the collected feedback in the memory buffer, and reinvokes the reasoning\nroutine to replan the future trajectory from the new state.\n The key idea is to cast reasoning in LLMs as learning and planning in\nBayesian adaptive Markov decision processes (MDPs). Correspondingly, we prompt\nLLMs to form an updated posterior of the unknown environment from the memory\nbuffer (learning) and generate an optimal trajectory for multiple future steps\nthat maximizes a value function (planning). The learning and planning\nsubroutines are performed in an \"in-context\" manner to emulate the actor-critic\nupdate for MDPs. Our theoretical analysis proves that the novel combination of\nlong-term reasoning and short-term acting achieves a $\\sqrt{T}$ regret. In\nparticular, the regret bound highlights an intriguing interplay between the\nprior knowledge obtained through pretraining and the uncertainty reduction\nachieved by reasoning and acting. Our empirical validation shows that it\noutperforms various existing frameworks and achieves nearly perfect scores on a\nfew benchmarks.\n","authors":["Zhihan Liu","Hao Hu","Shenao Zhang","Hongyi Guo","Shuqi Ke","Boyi Liu","Zhaoran Wang"],"pdf_url":"https://arxiv.org/pdf/2309.17382v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07223v1","updated":"2023-10-11T06:13:50Z","published":"2023-10-11T06:13:50Z","title":"Deep Learning for blind spectral unmixing of LULC classes with MODIS\n multispectral time series and ancillary data","summary":" Remotely sensed data are dominated by mixed Land Use and Land Cover (LULC)\ntypes. 
Spectral unmixing is a technique to extract information from mixed\npixels into their constituent LULC types and corresponding abundance fractions.\nTraditionally, solving this task has relied on either classical methods that\nrequire prior knowledge of endmembers or machine learning methods that avoid\nexplicit endmembers calculation, also known as blind spectral unmixing (BSU).\nMost BSU studies based on Deep Learning (DL) focus on one time-step\nhyperspectral data, yet its acquisition remains quite costly compared with\nmultispectral data. To our knowledge, here we provide the first study on BSU of\nLULC classes using multispectral time series data with DL models. We further\nboost the performance of a Long-Short Term Memory (LSTM)-based model by\nincorporating geographic plus topographic (geo-topographic) and climatic\nancillary information. Our experiments show that combining spectral-temporal\ninput data together with geo-topographic and climatic information substantially\nimproves the abundance estimation of LULC classes in mixed pixels. To carry out\nthis study, we built a new labeled dataset of the region of Andalusia (Spain)\nwith monthly multispectral time series of pixels for the year 2013 from MODIS\nat 460m resolution, for two hierarchical levels of LULC classes, named\nAndalusia MultiSpectral MultiTemporal Unmixing (Andalusia-MSMTU). This dataset\nprovides, at the pixel level, a multispectral time series plus ancillary\ninformation annotated with the abundance of each LULC class inside each pixel.\nThe dataset and code are available to the public.\n","authors":["José Rodríguez-Ortega","Rohaifa Khaldi","Domingo Alcaraz-Segura","Siham Tabik"],"pdf_url":"https://arxiv.org/pdf/2310.07223v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07221v1","updated":"2023-10-11T06:11:11Z","published":"2023-10-11T06:11:11Z","title":"Using Learnable Physics for Real-Time Exercise Form Recommendations","summary":" Good posture and form are essential for safe and productive exercising. Even\nin gym settings, trainers may not be readily available for feedback.\nRehabilitation therapies and fitness workouts can thus benefit from recommender\nsystems that provide real-time evaluation. In this paper, we present an\nalgorithmic pipeline that can diagnose problems in exercise techniques and\noffer corrective recommendations, with high sensitivity and specificity in\nreal-time. We use MediaPipe for pose recognition, count repetitions using\npeak-prominence detection, and use a learnable physics simulator to track\nmotion evolution for each exercise. A test video is diagnosed based on\ndeviations from the prototypical learned motion using statistical learning. The\nsystem is evaluated on six full and upper body exercises. 
These real-time\nrecommendations, counseled via low-cost equipment like smartphones, will allow\nexercisers to rectify potential mistakes making self-practice feasible while\nreducing the risk of workout injuries.\n","authors":["Abhishek Jaiswal","Gautam Chauhan","Nisheeth Srivastava"],"pdf_url":"https://arxiv.org/pdf/2310.07221v1.pdf","comment":"Accepted by ACM RecSys '23, 12 pages , 7 Figures"},{"id":"http://arxiv.org/abs/2310.07220v1","updated":"2023-10-11T06:10:07Z","published":"2023-10-11T06:10:07Z","title":"COPlanner: Plan to Roll Out Conservatively but to Explore Optimistically\n for Model-Based RL","summary":" Dyna-style model-based reinforcement learning contains two phases: model\nrollouts to generate sample for policy learning and real environment\nexploration using current policy for dynamics model learning. However, due to\nthe complex real-world environment, it is inevitable to learn an imperfect\ndynamics model with model prediction error, which can further mislead policy\nlearning and result in sub-optimal solutions. In this paper, we propose\n$\\texttt{COPlanner}$, a planning-driven framework for model-based methods to\naddress the inaccurately learned dynamics model problem with conservative model\nrollouts and optimistic environment exploration. $\\texttt{COPlanner}$ leverages\nan uncertainty-aware policy-guided model predictive control (UP-MPC) component\nto plan for multi-step uncertainty estimation. This estimated uncertainty then\nserves as a penalty during model rollouts and as a bonus during real\nenvironment exploration respectively, to choose actions. Consequently,\n$\\texttt{COPlanner}$ can avoid model uncertain regions through conservative\nmodel rollouts, thereby alleviating the influence of model error.\nSimultaneously, it explores high-reward model uncertain regions to reduce model\nerror actively through optimistic real environment exploration.\n$\\texttt{COPlanner}$ is a plug-and-play framework that can be applied to any\ndyna-style model-based methods. Experimental results on a series of\nproprioceptive and visual continuous control tasks demonstrate that both sample\nefficiency and asymptotic performance of strong model-based methods are\nsignificantly improved combined with $\\texttt{COPlanner}$.\n","authors":["Xiyao Wang","Ruijie Zheng","Yanchao Sun","Ruonan Jia","Wichayaporn Wongkamjan","Huazhe Xu","Furong Huang"],"pdf_url":"https://arxiv.org/pdf/2310.07220v1.pdf","comment":"20 pages, 12 figures"},{"id":"http://arxiv.org/abs/2310.07219v1","updated":"2023-10-11T06:09:48Z","published":"2023-10-11T06:09:48Z","title":"Improved Membership Inference Attacks Against Language Classification\n Models","summary":" Artificial intelligence systems are prevalent in everyday life, with use\ncases in retail, manufacturing, health, and many other fields. With the rise in\nAI adoption, associated risks have been identified, including privacy risks to\nthe people whose data was used to train models. Assessing the privacy risks of\nmachine learning models is crucial to enabling knowledgeable decisions on\nwhether to use, deploy, or share a model. A common approach to privacy risk\nassessment is to run one or more known attacks against the model and measure\ntheir success rate. We present a novel framework for running membership\ninference attacks against classification models. Our framework takes advantage\nof the ensemble method, generating many specialized attack models for different\nsubsets of the data. 
We show that this approach achieves higher accuracy than\neither a single attack model or an attack model per class label, both on\nclassical and language classification tasks.\n","authors":["Shlomit Shachor","Natalia Razinkov","Abigail Goldsteen"],"pdf_url":"https://arxiv.org/pdf/2310.07219v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07217v1","updated":"2023-10-11T06:09:14Z","published":"2023-10-11T06:09:14Z","title":"Enhancing Neural Architecture Search with Multiple Hardware Constraints\n for Deep Learning Model Deployment on Tiny IoT Devices","summary":" The rapid proliferation of computing domains relying on Internet of Things\n(IoT) devices has created a pressing need for efficient and accurate\ndeep-learning (DL) models that can run on low-power devices. However,\ntraditional DL models tend to be too complex and computationally intensive for\ntypical IoT end-nodes. To address this challenge, Neural Architecture Search\n(NAS) has emerged as a popular design automation technique for co-optimizing\nthe accuracy and complexity of deep neural networks. Nevertheless, existing NAS\ntechniques require many iterations to produce a network that adheres to\nspecific hardware constraints, such as the maximum memory available on the\nhardware or the maximum latency allowed by the target application. In this\nwork, we propose a novel approach to incorporate multiple constraints into\nso-called Differentiable NAS optimization methods, which allows the generation,\nin a single shot, of a model that respects user-defined constraints on both\nmemory and latency in a time comparable to a single standard training. The\nproposed approach is evaluated on five IoT-relevant benchmarks, including the\nMLPerf Tiny suite and Tiny ImageNet, demonstrating that, with a single search,\nit is possible to reduce memory and latency by 87.4% and 54.2%, respectively\n(as defined by our targets), while ensuring non-inferior accuracy on\nstate-of-the-art hand-tuned deep neural networks for TinyML.\n","authors":["Alessio Burrello","Matteo Risso","Beatrice Alessandra Motetti","Enrico Macii","Luca Benini","Daniele Jahier Pagliari"],"pdf_url":"https://arxiv.org/pdf/2310.07217v1.pdf","comment":"Accepted for publication at the IEEE Transactions on Emerging Topics\n in Computing"},{"id":"http://arxiv.org/abs/2310.07216v1","updated":"2023-10-11T06:04:40Z","published":"2023-10-11T06:04:40Z","title":"Generative Modeling on Manifolds Through Mixture of Riemannian Diffusion\n Processes","summary":" Learning the distribution of data on Riemannian manifolds is crucial for\nmodeling data from non-Euclidean space, which is required by many applications\nfrom diverse scientific fields. Yet, existing generative models on manifolds\nsuffer from expensive divergence computation or rely on approximations of heat\nkernel. These limitations restrict their applicability to simple geometries and\nhinder scalability to high dimensions. In this work, we introduce the\nRiemannian Diffusion Mixture, a principled framework for building a generative\nprocess on manifolds as a mixture of endpoint-conditioned diffusion processes\ninstead of relying on the denoising approach of previous diffusion models, for\nwhich the generative process is characterized by its drift guiding toward the\nmost probable endpoint with respect to the geometry of the manifold. We further\npropose a simple yet efficient training objective for learning the mixture\nprocess, that is readily applicable to general manifolds. 
Our method\noutperforms previous generative models on various manifolds while scaling to\nhigh dimensions and requires a dramatically reduced number of in-training\nsimulation steps for general manifolds.\n","authors":["Jaehyeong Jo","Sung Ju Hwang"],"pdf_url":"https://arxiv.org/pdf/2310.07216v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.04895v2","updated":"2023-10-11T05:59:53Z","published":"2023-10-07T18:47:17Z","title":"Cell Tracking-by-detection using Elliptical Bounding Boxes","summary":" Cell detection and tracking are paramount for bio-analysis. Recent approaches\nrely on the tracking-by-model evolution paradigm, which usually consists of\ntraining end-to-end deep learning models to detect and track the cells on the\nframes with promising results. However, such methods require extensive amounts\nof annotated data, which is time-consuming to obtain and often requires\nspecialized annotators. This work proposes a new approach based on the\nclassical tracking-by-detection paradigm that alleviates the requirement of\nannotated data. More precisely, it approximates the cell shapes as oriented\nellipses and then uses generic-purpose oriented object detectors to identify\nthe cells in each frame. We then rely on a global data association algorithm\nthat explores temporal cell similarity using probability distance metrics,\nconsidering that the ellipses relate to two-dimensional Gaussian distributions.\nOur results show that our method can achieve detection and tracking results\ncompetitively with state-of-the-art techniques that require considerably more\nextensive data annotation. Our code is available at:\nhttps://github.com/LucasKirsten/Deep-Cell-Tracking-EBB.\n","authors":["Lucas N. Kirsten","Cláudio R. Jung"],"pdf_url":"https://arxiv.org/pdf/2310.04895v2.pdf","comment":"Paper under review on IEEE/ACM Transactions on Computational Biology\n and Bioinformatics"},{"id":"http://arxiv.org/abs/2310.07211v1","updated":"2023-10-11T05:55:20Z","published":"2023-10-11T05:55:20Z","title":"Bridging the Gap between Newton-Raphson Method and Regularized Policy\n Iteration","summary":" Regularization is one of the most important techniques in reinforcement\nlearning algorithms. The well-known soft actor-critic algorithm is a special\ncase of regularized policy iteration where the regularizer is chosen as Shannon\nentropy. Despite some empirical success of regularized policy iteration, its\ntheoretical underpinnings remain unclear. This paper proves that regularized\npolicy iteration is strictly equivalent to the standard Newton-Raphson method\nin the condition of smoothing out Bellman equation with strongly convex\nfunctions. This equivalence lays the foundation of a unified analysis for both\nglobal and local convergence behaviors of regularized policy iteration. We\nprove that regularized policy iteration has global linear convergence with the\nrate being $\\gamma$ (discount factor). Furthermore, this algorithm converges\nquadratically once it enters a local region around the optimal value. We also\nshow that a modified version of regularized policy iteration, i.e., with\nfinite-step policy evaluation, is equivalent to inexact Newton method where the\nNewton iteration formula is solved with truncated iterations. We prove that the\nassociated algorithm achieves an asymptotic linear convergence rate of\n$\\gamma^M$ in which $M$ denotes the number of steps carried out in policy\nevaluation. 
Our results take a solid step towards a better understanding of the\nconvergence properties of regularized policy iteration algorithms.\n","authors":["Zeyang Li","Chuxiong Hu","Yunan Wang","Guojian Zhan","Jie Li","Shengbo Eben Li"],"pdf_url":"https://arxiv.org/pdf/2310.07211v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07207v1","updated":"2023-10-11T05:34:46Z","published":"2023-10-11T05:34:46Z","title":"Robust Safe Reinforcement Learning under Adversarial Disturbances","summary":" Safety is a primary concern when applying reinforcement learning to\nreal-world control tasks, especially in the presence of external disturbances.\nHowever, existing safe reinforcement learning algorithms rarely account for\nexternal disturbances, limiting their applicability and robustness in practice.\nTo address this challenge, this paper proposes a robust safe reinforcement\nlearning framework that tackles worst-case disturbances. First, this paper\npresents a policy iteration scheme to solve for the robust invariant set, i.e.,\na subset of the safe set, where persistent safety is only possible for states\nwithin. The key idea is to establish a two-player zero-sum game by leveraging\nthe safety value function in Hamilton-Jacobi reachability analysis, in which\nthe protagonist (i.e., control inputs) aims to maintain safety and the\nadversary (i.e., external disturbances) tries to break down safety. This paper\nproves that the proposed policy iteration algorithm converges monotonically to\nthe maximal robust invariant set. Second, this paper integrates the proposed\npolicy iteration scheme into a constrained reinforcement learning algorithm\nthat simultaneously synthesizes the robust invariant set and uses it for\nconstrained policy optimization. This algorithm tackles both optimality and\nsafety, i.e., learning a policy that attains high rewards while maintaining\nsafety under worst-case disturbances. Experiments on classic control tasks show\nthat the proposed method achieves zero constraint violation with learned\nworst-case adversarial disturbances, while other baseline algorithms violate\nthe safety constraints substantially. Our proposed method also attains\ncomparable performance as the baselines even in the absence of the adversary.\n","authors":["Zeyang Li","Chuxiong Hu","Shengbo Eben Li","Jia Cheng","Yunan Wang"],"pdf_url":"https://arxiv.org/pdf/2310.07207v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07204v1","updated":"2023-10-11T05:32:29Z","published":"2023-10-11T05:32:29Z","title":"State of the Art on Diffusion Models for Visual Computing","summary":" The field of visual computing is rapidly advancing due to the emergence of\ngenerative artificial intelligence (AI), which unlocks unprecedented\ncapabilities for the generation, editing, and reconstruction of images, videos,\nand 3D scenes. In these domains, diffusion models are the generative AI\narchitecture of choice. Within the last year alone, the literature on\ndiffusion-based tools and applications has seen exponential growth and relevant\npapers are published across the computer graphics, computer vision, and AI\ncommunities with new works appearing daily on arXiv. This rapid growth of the\nfield makes it difficult to keep up with all recent developments. 
The goal of\nthis state-of-the-art report (STAR) is to introduce the basic mathematical\nconcepts of diffusion models, implementation details and design choices of the\npopular Stable Diffusion model, as well as overview important aspects of these\ngenerative AI tools, including personalization, conditioning, inversion, among\nothers. Moreover, we give a comprehensive overview of the rapidly growing\nliterature on diffusion-based generation and editing, categorized by the type\nof generated medium, including 2D images, videos, 3D objects, locomotion, and\n4D scenes. Finally, we discuss available datasets, metrics, open challenges,\nand social implications. This STAR provides an intuitive starting point to\nexplore this exciting topic for researchers, artists, and practitioners alike.\n","authors":["Ryan Po","Wang Yifan","Vladislav Golyanik","Kfir Aberman","Jonathan T. Barron","Amit H. Bermano","Eric Ryan Chan","Tali Dekel","Aleksander Holynski","Angjoo Kanazawa","C. Karen Liu","Lingjie Liu","Ben Mildenhall","Matthias Nießner","Björn Ommer","Christian Theobalt","Peter Wonka","Gordon Wetzstein"],"pdf_url":"https://arxiv.org/pdf/2310.07204v1.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2310.07517v1","updated":"2023-10-11T14:15:25Z","published":"2023-10-11T14:15:25Z","title":"CM-PIE: Cross-modal perception for interactive-enhanced audio-visual\n video parsing","summary":" Audio-visual video parsing is the task of categorizing a video at the segment\nlevel with weak labels, and predicting them as audible or visible events.\nRecent methods for this task leverage the attention mechanism to capture the\nsemantic correlations among the whole video across the audio-visual modalities.\nHowever, these approaches have overlooked the importance of individual segments\nwithin a video and the relationship among them, and tend to rely on a single\nmodality when learning features. In this paper, we propose a novel\ninteractive-enhanced cross-modal perception method~(CM-PIE), which can learn\nfine-grained features by applying a segment-based attention module.\nFurthermore, a cross-modal aggregation block is introduced to jointly optimize\nthe semantic representation of audio and visual signals by enhancing\ninter-modal interactions. The experimental results show that our model offers\nimproved parsing performance on the Look, Listen, and Parse dataset compared to\nother methods.\n","authors":["Yaru Chen","Ruohao Guo","Xubo Liu","Peipei Wu","Guangyao Li","Zhenbo Li","Wenwu Wang"],"pdf_url":"https://arxiv.org/pdf/2310.07517v1.pdf","comment":"5 pages, 3 figures, 15 references"},{"id":"http://arxiv.org/abs/2304.11161v2","updated":"2023-10-11T13:29:23Z","published":"2023-04-02T16:03:44Z","title":"altiro3D: Scene representation from single image and novel view\n synthesis","summary":" We introduce altiro3D, a free extended library developed to represent reality\nstarting from a given original RGB image or flat video. It allows to generate a\nlight-field (or Native) image or video and get a realistic 3D experience. To\nsynthesize N-number of virtual images and add them sequentially into a Quilt\ncollage, we apply MiDaS models for the monocular depth estimation, simple\nOpenCV and Telea inpainting techniques to map all pixels, and implement a\n'Fast' algorithm to handle 3D projection camera and scene transformations along\nN-viewpoints. We use the degree of depth to move proportionally the pixels,\nassuming the original image to be at the center of all the viewpoints. 
altiro3D\ncan also be used with DIBR algorithm to compute intermediate snapshots from a\nequivalent 'Real (slower)' camera with N-geometric viewpoints, which requires\nto calibrate a priori several intrinsic and extrinsic camera parameters. We\nadopt a pixel- and device-based Lookup Table to optimize computing time. The\nmultiple viewpoints and video generated from a single image or frame can be\ndisplayed in a free-view LCD display.\n","authors":["E. Canessa","L. Tenze"],"pdf_url":"https://arxiv.org/pdf/2304.11161v2.pdf","comment":"In press (2023) Springer International Journal of Information\n Technology (IJIT) 10 pages, 3 figures"},{"id":"http://arxiv.org/abs/2310.07376v1","updated":"2023-10-11T10:50:15Z","published":"2023-10-11T10:50:15Z","title":"Point Cloud Denoising and Outlier Detection with Local Geometric\n Structure by Dynamic Graph CNN","summary":" The digitalization of society is rapidly developing toward the realization of\nthe digital twin and metaverse. In particular, point clouds are attracting\nattention as a media format for 3D space. Point cloud data is contaminated with\nnoise and outliers due to measurement errors. Therefore, denoising and outlier\ndetection are necessary for point cloud processing. Among them, PointCleanNet\nis an effective method for point cloud denoising and outlier detection.\nHowever, it does not consider the local geometric structure of the patch. We\nsolve this problem by applying two types of graph convolutional layer designed\nbased on the Dynamic Graph CNN. Experimental results show that the proposed\nmethods outperform the conventional method in AUPR, which indicates outlier\ndetection accuracy, and Chamfer Distance, which indicates denoising accuracy.\n","authors":["Kosuke Nakayama","Hiroto Fukuta","Hiroshi Watanabe"],"pdf_url":"https://arxiv.org/pdf/2310.07376v1.pdf","comment":"2023 IEEE 12th Global Conference on Consumer Electronics (GCCE 2023)"},{"id":"http://arxiv.org/abs/2310.07287v1","updated":"2023-10-11T08:18:51Z","published":"2023-10-11T08:18:51Z","title":"Interactive Interior Design Recommendation via Coarse-to-fine Multimodal\n Reinforcement Learning","summary":" Personalized interior decoration design often incurs high labor costs. Recent\nefforts in developing intelligent interior design systems have focused on\ngenerating textual requirement-based decoration designs while neglecting the\nproblem of how to mine homeowner's hidden preferences and choose the proper\ninitial design. To fill this gap, we propose an Interactive Interior Design\nRecommendation System (IIDRS) based on reinforcement learning (RL). IIDRS aims\nto find an ideal plan by interacting with the user, who provides feedback on\nthe gap between the recommended plan and their ideal one. To improve\ndecision-making efficiency and effectiveness in large decoration spaces, we\npropose a Decoration Recommendation Coarse-to-Fine Policy Network (DecorRCFN).\nAdditionally, to enhance generalization in online scenarios, we propose an\nobject-aware feedback generation method that augments model training with\ndiversified and dynamic textual feedback. Extensive experiments on a real-world\ndataset demonstrate our method outperforms traditional methods by a large\nmargin in terms of recommendation accuracy. 
Further user studies demonstrate\nthat our method reaches higher real-world user satisfaction than baseline\nmethods.\n","authors":["He Zhang","Ying Sun","Weiyu Guo","Yafei Liu","Haonan Lu","Xiaodong Lin","Hui Xiong"],"pdf_url":"https://arxiv.org/pdf/2310.07287v1.pdf","comment":"Accepted by ACM International Conference on Multimedia'23. 9 pages, 7\n figures"},{"id":"http://arxiv.org/abs/2310.07236v1","updated":"2023-10-11T06:56:08Z","published":"2023-10-11T06:56:08Z","title":"AdaMesh: Personalized Facial Expressions and Head Poses for\n Speech-Driven 3D Facial Animation","summary":" Speech-driven 3D facial animation aims at generating facial movements that\nare synchronized with the driving speech, which has been widely explored\nrecently. Existing works mostly neglect the person-specific talking style in\ngeneration, including facial expression and head pose styles. Several works\nintend to capture the personalities by fine-tuning modules. However, limited\ntraining data leads to a lack of vividness. In this work, we propose AdaMesh,\na novel adaptive speech-driven facial animation approach, which learns the\npersonalized talking style from a reference video of about 10 seconds and\ngenerates vivid facial expressions and head poses. Specifically, we propose\nmixture-of-low-rank adaptation (MoLoRA) to fine-tune the expression adapter,\nwhich efficiently captures the facial expression style. For the personalized\npose style, we propose a pose adapter by building a discrete pose prior and\nretrieving the appropriate style embedding with a semantic-aware pose style\nmatrix without fine-tuning. Extensive experimental results show that our\napproach outperforms state-of-the-art methods, preserves the talking style in\nthe reference video, and generates vivid facial animation. The supplementary\nvideo and code will be available at https://adamesh.github.io.\n","authors":["Liyang Chen","Weihong Bao","Shun Lei","Boshi Tang","Zhiyong Wu","Shiyin Kang","Haozhi Huang"],"pdf_url":"https://arxiv.org/pdf/2310.07236v1.pdf","comment":"Project Page: https://adamesh.github.io"},{"id":"http://arxiv.org/abs/2310.04673v3","updated":"2023-10-11T02:55:54Z","published":"2023-10-07T03:17:59Z","title":"LauraGPT: Listen, Attend, Understand, and Regenerate Audio with GPT","summary":" Generative Pre-trained Transformer (GPT) models have achieved remarkable\nperformance on various natural language processing tasks. However, there has\nbeen limited research on applying similar frameworks to audio tasks. Previously\nproposed large language models for audio tasks either lack sufficient\nquantitative evaluations, or are limited to tasks for recognizing and\nunderstanding audio content, or significantly underperform existing\nstate-of-the-art (SOTA) models. In this paper, we propose LauraGPT, a unified\nGPT model for audio recognition, understanding, and generation. LauraGPT is a\nversatile language model that can process both audio and text inputs and\ngenerate outputs in either modality. It can perform a wide range of tasks\nrelated to content, semantics, paralinguistics, and audio-signal analysis. Some\nof its noteworthy tasks include automatic speech recognition, speech-to-text\ntranslation, text-to-speech synthesis, machine translation, speech enhancement,\nautomated audio captioning, speech emotion recognition, and spoken language\nunderstanding. To achieve this goal, we use a combination of continuous and\ndiscrete features for audio. 
We encode input audio into continuous\nrepresentations using an audio encoder and decode output audio from discrete\ncodec codes. We then fine-tune a large decoder-only Transformer-based language\nmodel on multiple audio-to-text, text-to-audio, audio-to-audio, and\ntext-to-text tasks using a supervised multitask learning approach. Extensive\nexperiments show that LauraGPT achieves competitive or superior performance\ncompared to existing SOTA models on various audio processing benchmarks.\n","authors":["Jiaming Wang","Zhihao Du","Qian Chen","Yunfei Chu","Zhifu Gao","Zerui Li","Kai Hu","Xiaohuan Zhou","Jin Xu","Ziyang Ma","Wen Wang","Siqi Zheng","Chang Zhou","Zhijie Yan","Shiliang Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.04673v3.pdf","comment":"10 pages, under review"},{"id":"http://arxiv.org/abs/2310.07121v1","updated":"2023-10-11T01:51:19Z","published":"2023-10-11T01:51:19Z","title":"Motion Vector-Domain Video Steganalysis Exploiting Skipped Macroblocks","summary":" Video steganography has the potential to be used to convey illegal\ninformation, and video steganalysis is a vital tool to detect the presence of\nthis illicit act. Currently, all the motion vector (MV)-based video\nsteganalysis algorithms extract feature sets directly on the MVs, but ignore\nthat the steganographic operation may perturb the statistical distribution of other\nvideo encoding elements, such as the skipped macroblocks (no direct MVs). This\npaper proposes a novel 11-dimensional feature set to detect MV-based video\nsteganography based on the above observation. The proposed feature is extracted\nbased on the skipped macroblocks by recompression calibration. Specifically,\nthe feature consists of two components. The first is the probability\ndistribution of motion vector prediction (MVP) difference, and the second is\nthe probability distribution of partition state transfer. Extensive experiments\non different conditions demonstrate that the proposed feature set achieves good\ndetection accuracy, especially at lower embedding capacities. In addition, the\nloss of detection performance caused by recompression calibration using\nmismatched quantization parameters (QP) is within the acceptable range, so the\nproposed method can be used in practical scenarios.\n","authors":["Jun Li","Minqing Zhang","Ke Niu","Yingnan Zhang","Xiaoyuan Yang"],"pdf_url":"https://arxiv.org/pdf/2310.07121v1.pdf","comment":null}]},"2023-10-12T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2310.08582v1","updated":"2023-10-12T17:59:50Z","published":"2023-10-12T17:59:50Z","title":"Tree-Planner: Efficient Close-loop Task Planning with Large Language\n Models","summary":" This paper studies close-loop task planning, which refers to the process of\ngenerating a sequence of skills (a plan) to accomplish a specific goal while\nadapting the plan based on real-time observations. Recently, prompting Large\nLanguage Models (LLMs) to generate actions iteratively has become a prevalent\nparadigm due to its superior performance and user-friendliness. However, this\nparadigm is plagued by two inefficiencies: high token consumption and redundant\nerror correction, both of which hinder its scalability for large-scale testing\nand applications. To address these issues, we propose Tree-Planner, which\nreframes task planning with LLMs into three distinct phases: plan sampling,\naction tree construction, and grounded deciding. 
Tree-Planner starts by using\nan LLM to sample a set of potential plans before execution, followed by the\naggregation of them to form an action tree. Finally, the LLM performs a\ntop-down decision-making process on the tree, taking into account real-time\nenvironmental information. Experiments show that Tree-Planner achieves\nstate-of-the-art performance while maintaining high efficiency. By decomposing\nLLM queries into a single plan-sampling call and multiple grounded-deciding\ncalls, a considerable part of the prompt are less likely to be repeatedly\nconsumed. As a result, token consumption is reduced by 92.2% compared to the\npreviously best-performing model. Additionally, by enabling backtracking on the\naction tree as needed, the correction process becomes more flexible, leading to\na 40.5% decrease in error corrections. Project page:\nhttps://tree-planner.github.io/\n","authors":["Mengkang Hu","Yao Mu","Xinmiao Yu","Mingyu Ding","Shiguang Wu","Wenqi Shao","Qiguang Chen","Bin Wang","Yu Qiao","Ping Luo"],"pdf_url":"https://arxiv.org/pdf/2310.08582v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08577v1","updated":"2023-10-12T17:59:30Z","published":"2023-10-12T17:59:30Z","title":"Visual Data-Type Understanding does not emerge from Scaling\n Vision-Language Models","summary":" Recent advances in the development of vision-language models (VLMs) are\nyielding remarkable success in recognizing visual semantic content, including\nimpressive instances of compositional image understanding. Here, we introduce\nthe novel task of \\textit{Visual Data-Type Identification}, a basic perceptual\nskill with implications for data curation (e.g., noisy data-removal from large\ndatasets, domain-specific retrieval) and autonomous vision (e.g.,\ndistinguishing changing weather conditions from camera lens staining). We\ndevelop two datasets consisting of animal images altered across a diverse set\nof 27 visual \\textit{data-types}, spanning four broad categories. An extensive\nzero-shot evaluation of 39 VLMs, ranging from 100M to 80B parameters, shows a\nnuanced performance landscape. While VLMs are reasonably good at identifying\ncertain stylistic \\textit{data-types}, such as cartoons and sketches, they\nstruggle with simpler \\textit{data-types} arising from basic manipulations like\nimage rotations or additive noise. Our findings reveal that (i) model scaling\nalone yields marginal gains for contrastively-trained models like CLIP, and\n(ii) there is a pronounced drop in performance for the largest\nauto-regressively trained VLMs like OpenFlamingo. This finding points to a\nblind spot in current frontier VLMs: they excel in recognizing semantic content\nbut fail to acquire an understanding of visual \\textit{data-types} through\nscaling. By analyzing the pre-training distributions of these models and\nincorporating \\textit{data-type} information into the captions during\nfine-tuning, we achieve a significant enhancement in performance. By exploring\nthis previously uncharted task, we aim to set the stage for further advancing\nVLMs to equip them with visual data-type understanding. Code and datasets are\nreleased \\href{https://github.com/bethgelab/DataTypeIdentification}{here}.\n","authors":["Vishaal Udandarao","Max F. 
Burg","Samuel Albanie","Matthias Bethge"],"pdf_url":"https://arxiv.org/pdf/2310.08577v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08566v1","updated":"2023-10-12T17:55:02Z","published":"2023-10-12T17:55:02Z","title":"Transformers as Decision Makers: Provable In-Context Reinforcement\n Learning via Supervised Pretraining","summary":" Large transformer models pretrained on offline reinforcement learning\ndatasets have demonstrated remarkable in-context reinforcement learning (ICRL)\ncapabilities, where they can make good decisions when prompted with interaction\ntrajectories from unseen environments. However, when and how transformers can\nbe trained to perform ICRL have not been theoretically well-understood. In\nparticular, it is unclear which reinforcement-learning algorithms transformers\ncan perform in context, and how distribution mismatch in offline training data\naffects the learned algorithms. This paper provides a theoretical framework\nthat analyzes supervised pretraining for ICRL. This includes two recently\nproposed training methods -- algorithm distillation and decision-pretrained\ntransformers. First, assuming model realizability, we prove the\nsupervised-pretrained transformer will imitate the conditional expectation of\nthe expert algorithm given the observed trajectory. The generalization error\nwill scale with model capacity and a distribution divergence factor between the\nexpert and offline algorithms. Second, we show transformers with ReLU attention\ncan efficiently approximate near-optimal online reinforcement learning\nalgorithms like LinUCB and Thompson sampling for stochastic linear bandits, and\nUCB-VI for tabular Markov decision processes. This provides the first\nquantitative analysis of the ICRL capabilities of transformers pretrained from\noffline trajectories.\n","authors":["Licong Lin","Yu Bai","Song Mei"],"pdf_url":"https://arxiv.org/pdf/2310.08566v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08559v1","updated":"2023-10-12T17:51:10Z","published":"2023-10-12T17:51:10Z","title":"Phenomenal Yet Puzzling: Testing Inductive Reasoning Capabilities of\n Language Models with Hypothesis Refinement","summary":" The ability to derive underlying principles from a handful of observations\nand then generalize to novel situations -- known as inductive reasoning -- is\ncentral to human intelligence. Prior work suggests that language models (LMs)\noften fall short on inductive reasoning, despite achieving impressive success\non research benchmarks. In this work, we conduct a systematic study of the\ninductive reasoning capabilities of LMs through iterative hypothesis\nrefinement, a technique that more closely mirrors the human inductive process\nthan standard input-output prompting. Iterative hypothesis refinement employs a\nthree-step process: proposing, selecting, and refining hypotheses in the form\nof textual rules. By examining the intermediate rules, we observe that LMs are\nphenomenal hypothesis proposers (i.e., generating candidate rules), and when\ncoupled with a (task-specific) symbolic interpreter that is able to\nsystematically filter the proposed set of rules, this hybrid approach achieves\nstrong results across inductive reasoning benchmarks that require inducing\ncausal relations, language-like instructions, and symbolic concepts. 
However,\nthey also behave as puzzling inductive reasoners, showing notable performance\ngaps in rule induction (i.e., identifying plausible rules) and rule application\n(i.e., applying proposed rules to instances), suggesting that LMs are proposing\nhypotheses without being able to actually apply the rules. Through empirical\nand human analyses, we further reveal several discrepancies between the\ninductive reasoning processes of LMs and humans, shedding light on both the\npotentials and limitations of using LMs in inductive reasoning tasks.\n","authors":["Linlu Qiu","Liwei Jiang","Ximing Lu","Melanie Sclar","Valentina Pyatkin","Chandra Bhagavatula","Bailin Wang","Yoon Kim","Yejin Choi","Nouha Dziri","Xiang Ren"],"pdf_url":"https://arxiv.org/pdf/2310.08559v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11606v2","updated":"2023-10-12T17:50:38Z","published":"2023-08-22T17:53:55Z","title":"StoryBench: A Multifaceted Benchmark for Continuous Story Visualization","summary":" Generating video stories from text prompts is a complex task. In addition to\nhaving high visual quality, videos need to realistically adhere to a sequence\nof text prompts whilst being consistent throughout the frames. Creating a\nbenchmark for video generation requires data annotated over time, which\ncontrasts with the single caption used often in video datasets. To fill this\ngap, we collect comprehensive human annotations on three existing datasets, and\nintroduce StoryBench: a new, challenging multi-task benchmark to reliably\nevaluate forthcoming text-to-video models. Our benchmark includes three video\ngeneration tasks of increasing difficulty: action execution, where the next\naction must be generated starting from a conditioning video; story\ncontinuation, where a sequence of actions must be executed starting from a\nconditioning video; and story generation, where a video must be generated from\nonly text prompts. We evaluate small yet strong text-to-video baselines, and\nshow the benefits of training on story-like data algorithmically generated from\nexisting video captions. Finally, we establish guidelines for human evaluation\nof video stories, and reaffirm the need of better automatic metrics for video\ngeneration. StoryBench aims at encouraging future research efforts in this\nexciting new area.\n","authors":["Emanuele Bugliarello","Hernan Moraldo","Ruben Villegas","Mohammad Babaeizadeh","Mohammad Taghi Saffar","Han Zhang","Dumitru Erhan","Vittorio Ferrari","Pieter-Jan Kindermans","Paul Voigtlaender"],"pdf_url":"https://arxiv.org/pdf/2308.11606v2.pdf","comment":"NeurIPS D&B 2023"},{"id":"http://arxiv.org/abs/2310.00737v2","updated":"2023-10-12T17:39:04Z","published":"2023-10-01T17:25:56Z","title":"GenAI Against Humanity: Nefarious Applications of Generative Artificial\n Intelligence and Large Language Models","summary":" Generative Artificial Intelligence (GenAI) and Large Language Models (LLMs)\nare marvels of technology; celebrated for their prowess in natural language\nprocessing and multimodal content generation, they promise a transformative\nfuture. But as with all powerful tools, they come with their shadows. Picture\nliving in a world where deepfakes are indistinguishable from reality, where\nsynthetic identities orchestrate malicious campaigns, and where targeted\nmisinformation or scams are crafted with unparalleled precision. Welcome to the\ndarker side of GenAI applications. 
This article is not just a journey through\nthe meanders of potential misuse of GenAI and LLMs, but also a call to\nrecognize the urgency of the challenges ahead. As we navigate the seas of\nmisinformation campaigns, malicious content generation, and the eerie creation\nof sophisticated malware, we'll uncover the societal implications that ripple\nthrough the GenAI revolution we are witnessing. From AI-powered botnets on\nsocial media platforms to the unnerving potential of AI to generate fabricated\nidentities, or alibis made of synthetic realities, the stakes have never been\nhigher. The lines between the virtual and the real worlds are blurring, and the\nconsequences of potential GenAI's nefarious applications impact us all. This\narticle serves both as a synthesis of rigorous research presented on the risks\nof GenAI and misuse of LLMs and as a thought-provoking vision of the different\ntypes of harmful GenAI applications we might encounter in the near future, and\nsome ways we can prepare for them.\n","authors":["Emilio Ferrara"],"pdf_url":"https://arxiv.org/pdf/2310.00737v2.pdf","comment":"Submitted to CACM (Viewpoint)"},{"id":"http://arxiv.org/abs/2306.07951v2","updated":"2023-10-12T17:34:12Z","published":"2023-06-13T17:48:27Z","title":"Questioning the Survey Responses of Large Language Models","summary":" As large language models increase in capability, researchers have started to\nconduct surveys of all kinds on these models with varying scientific\nmotivations. In this work, we examine what we can learn from language models'\nsurvey responses on the basis of the well-established American Community Survey\n(ACS) by the U.S. Census Bureau. Using a de-facto standard multiple-choice\nprompting technique and evaluating 40 different language models, hundreds of\nthousands of times each on questions from the ACS, we systematically establish\ntwo dominant patterns. First, models have significant position and labeling\nbiases, for example, towards survey responses labeled with the letter \"A\".\nSecond, when adjusting for labeling biases through randomized answer ordering,\nmodels across the board trend towards uniformly random survey responses. In\nfact, binary classifiers can almost perfectly differentiate between models'\nresponses to the ACS and the responses of the US census. Taken together, our\nfindings suggest caution in treating survey responses from language models as\nequivalent to those of human populations at present time.\n","authors":["Ricardo Dominguez-Olmedo","Moritz Hardt","Celestine Mendler-Dünner"],"pdf_url":"https://arxiv.org/pdf/2306.07951v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08540v1","updated":"2023-10-12T17:32:09Z","published":"2023-10-12T17:32:09Z","title":"Do pretrained Transformers Really Learn In-context by Gradient Descent?","summary":" Is In-Context Learning (ICL) implicitly equivalent to Gradient Descent (GD)?\nSeveral recent works draw analogies between the dynamics of GD and the emergent\nbehavior of ICL in large language models. However, these works make assumptions\nfar from the realistic natural language setting in which language models are\ntrained. Such discrepancies between theory and practice, therefore, necessitate\nfurther investigation to validate their applicability.\n We start by highlighting the weaknesses in prior works that construct\nTransformer weights to simulate gradient descent. 
Their experiments with\ntraining Transformers on ICL objective, inconsistencies in the order\nsensitivity of ICL and GD, sparsity of the constructed weights, and sensitivity\nto parameter changes are some examples of a mismatch from the real-world\nsetting.\n Furthermore, we probe and compare the ICL vs. GD hypothesis in a natural\nsetting. We conduct comprehensive empirical analyses on language models\npretrained on natural data (LLaMa-7B). Our comparisons on various performance\nmetrics highlight the inconsistent behavior of ICL and GD as a function of\nvarious factors such as datasets, models, and number of demonstrations. We\nobserve that ICL and GD adapt the output distribution of language models\ndifferently. These results indicate that the equivalence between ICL and GD is\nan open hypothesis, requires nuanced considerations and calls for further\nstudies.\n","authors":["Lingfeng Shen","Aayush Mishra","Daniel Khashabi"],"pdf_url":"https://arxiv.org/pdf/2310.08540v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05173v2","updated":"2023-10-12T17:25:44Z","published":"2023-09-11T00:02:05Z","title":"DePT: Decomposed Prompt Tuning for Parameter-Efficient Fine-tuning","summary":" Prompt tuning (PT), where a small amount of trainable soft (continuous)\nprompt vectors is affixed to the input of language models (LM), has shown\npromising results across various tasks and models for parameter-efficient\nfine-tuning (PEFT). PT stands out from other PEFT approaches because it\nmaintains competitive performance with fewer trainable parameters and does not\ndrastically scale up its parameters as the model size expands. However, PT\nintroduces additional soft prompt tokens, leading to longer input sequences,\nwhich significantly impacts training and inference time and memory usage due to\nthe Transformer's quadratic complexity. Particularly concerning for Large\nLanguage Models (LLMs) that face heavy daily querying. To address this issue,\nwe propose Decomposed Prompt Tuning (DePT), which decomposes the soft prompt\ninto a shorter soft prompt and a pair of low-rank matrices that are then\noptimised with two different learning rates. This allows DePT to achieve better\nperformance while saving over 20% memory and time costs compared to vanilla PT\nand its variants, without changing trainable parameter sizes. Through extensive\nexperiments on 23 natural language processing (NLP) and vision-language (VL)\ntasks, we demonstrate that DePT outperforms state-of-the-art PEFT approaches,\nincluding the full fine-tuning baseline in some scenarios. Additionally, we\nempirically show that DEPT grows more efficient as the model size increases.\nOur further study reveals that DePT integrates seamlessly with\nparameter-efficient transfer learning in the few-shot learning setting and\nhighlights its adaptability to various model architectures and sizes.\n","authors":["Zhengxiang Shi","Aldo Lipani"],"pdf_url":"https://arxiv.org/pdf/2309.05173v2.pdf","comment":"Code is available at https://github.com/ZhengxiangShi/DePT"},{"id":"http://arxiv.org/abs/2310.08535v1","updated":"2023-10-12T17:24:15Z","published":"2023-10-12T17:24:15Z","title":"Formally Specifying the High-Level Behavior of LLM-Based Agents","summary":" LLM-based agents have recently emerged as promising tools for solving\nchallenging problems without the need for task-specific finetuned models that\ncan be expensive to procure. 
Currently, the design and implementation of such\nagents is ad hoc, as the wide variety of tasks that LLM-based agents may be\napplied to naturally means there can be no one-size-fits-all approach to agent\ndesign. In this work we aim to alleviate the difficulty of designing and\nimplementing new agents by proposing a minimalistic, high-level generation\nframework that simplifies the process of building agents. The framework we\nintroduce allows the user to specify desired agent behaviors in Linear Temporal\nLogic (LTL). The declarative LTL specification is then used to construct a\nconstrained decoder that guarantees the LLM will produce an output exhibiting\nthe desired behavior. By designing our framework in this way, we obtain several\nbenefits, including the ability to enforce complex agent behavior, the ability\nto formally validate prompt examples, and the ability to seamlessly incorporate\ncontent-focused logical constraints into generation. In particular, our\ndeclarative approach, in which the desired behavior is simply described without\nconcern for how it should be implemented or enforced, enables rapid design,\nimplementation and experimentation with different LLM-based agents. We\ndemonstrate how the proposed framework can be used to implement recent\nLLM-based agents, and show how the guardrails our approach provides can lead to\nimprovements in agent performance. In addition, we release our code for general\nuse.\n","authors":["Maxwell Crouse","Ibrahim Abdelaziz","Kinjal Basu","Soham Dan","Sadhana Kumaravel","Achille Fokoue","Pavan Kapanipathi","Luis Lastras"],"pdf_url":"https://arxiv.org/pdf/2310.08535v1.pdf","comment":"Preprint under review"},{"id":"http://arxiv.org/abs/2302.07863v4","updated":"2023-10-12T17:23:56Z","published":"2023-02-15T18:55:29Z","title":"Speculative Decoding with Big Little Decoder","summary":" The recent emergence of Large Language Models based on the Transformer\narchitecture has enabled dramatic advancements in the field of Natural Language\nProcessing. However, these models have long inference latency, which limits\ntheir deployment and makes them prohibitively expensive for various real-time\napplications. The inference latency is further exacerbated by autoregressive\ngenerative tasks, as models need to run iteratively to generate tokens\nsequentially without leveraging token-level parallelization. To address this,\nwe propose Big Little Decoder (BiLD), a framework that can improve inference\nefficiency and latency for a wide range of text generation applications. The\nBiLD framework contains two models with different sizes that collaboratively\ngenerate text. The small model runs autoregressively to generate text with a\nlow inference cost, and the large model is only invoked occasionally to refine\nthe small model's inaccurate predictions in a non-autoregressive manner. To\ncoordinate the small and large models, BiLD introduces two simple yet effective\npolicies: (1) the fallback policy that determines when to hand control over to\nthe large model; and (2) the rollback policy that determines when the large\nmodel needs to correct the small model's inaccurate predictions. To evaluate\nour framework across different tasks and models, we apply BiLD to various text\ngeneration scenarios encompassing machine translation on IWSLT 2017 De-En and\nWMT 2014 De-En, and summarization on XSUM and CNN/DailyMail. On an NVIDIA T4\nGPU, our framework achieves a speedup of up to 2.12x speedup with minimal\ngeneration quality degradation. 
Furthermore, our framework is fully\nplug-and-play and can be applied without any modifications in the training\nprocess or model architecture. Our code is open-sourced\n","authors":["Sehoon Kim","Karttikeya Mangalam","Suhong Moon","Jitendra Malik","Michael W. Mahoney","Amir Gholami","Kurt Keutzer"],"pdf_url":"https://arxiv.org/pdf/2302.07863v4.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2310.08523v1","updated":"2023-10-12T17:17:27Z","published":"2023-10-12T17:17:27Z","title":"LLM-augmented Preference Learning from Natural Language","summary":" Finding preferences expressed in natural language is an important but\nchallenging task. State-of-the-art(SotA) methods leverage transformer-based\nmodels such as BERT, RoBERTa, etc. and graph neural architectures such as graph\nattention networks. Since Large Language Models (LLMs) are equipped to deal\nwith larger context lengths and have much larger model sizes than the\ntransformer-based model, we investigate their ability to classify comparative\ntext directly. This work aims to serve as a first step towards using LLMs for\nthe CPC task. We design and conduct a set of experiments that format the\nclassification task into an input prompt for the LLM and a methodology to get a\nfixed-format response that can be automatically evaluated. Comparing\nperformances with existing methods, we see that pre-trained LLMs are able to\noutperform the previous SotA models with no fine-tuning involved. Our results\nshow that the LLMs can consistently outperform the SotA when the target text is\nlarge -- i.e. composed of multiple sentences --, and are still comparable to\nthe SotA performance in shorter text. We also find that few-shot learning\nyields better performance than zero-shot learning.\n","authors":["Inwon Kang","Sikai Ruan","Tyler Ho","Jui-Chien Lin","Farhad Mohsin","Oshani Seneviratne","Lirong Xia"],"pdf_url":"https://arxiv.org/pdf/2310.08523v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2204.07580v2","updated":"2023-10-12T17:07:29Z","published":"2022-04-15T13:02:33Z","title":"mGPT: Few-Shot Learners Go Multilingual","summary":" Recent studies report that autoregressive language models can successfully\nsolve many NLP tasks via zero- and few-shot learning paradigms, which opens up\nnew possibilities for using the pre-trained language models. This paper\nintroduces two autoregressive GPT-like models with 1.3 billion and 13 billion\nparameters trained on 60 languages from 25 language families using Wikipedia\nand Colossal Clean Crawled Corpus. We reproduce the GPT-3 architecture using\nGPT-2 sources and the sparse attention mechanism; Deepspeed and Megatron\nframeworks allow us to parallelize the training and inference steps\neffectively. The resulting models show performance on par with the recently\nreleased XGLM models by Facebook, covering more languages and enhancing NLP\npossibilities for low resource languages of CIS countries and Russian small\nnations. We detail the motivation for the choices of the architecture design,\nthoroughly describe the data preparation pipeline, and train five small\nversions of the model to choose the most optimal multilingual tokenization\nstrategy. We measure the model perplexity in all covered languages and evaluate\nit on the wide spectre of multilingual tasks, including classification,\ngenerative, sequence labeling and knowledge probing. The models were evaluated\nwith the zero-shot and few-shot methods. 
Furthermore, we compared the\nclassification tasks with the state-of-the-art multilingual model XGLM. source\ncode and the mGPT XL model are publicly released.\n","authors":["Oleh Shliazhko","Alena Fenogenova","Maria Tikhonova","Vladislav Mikhailov","Anastasia Kozlova","Tatiana Shavrina"],"pdf_url":"https://arxiv.org/pdf/2204.07580v2.pdf","comment":"Accepted for publication at Transactions of the Association for\n Computational Linguistics (TACL) To be presented at the Conference on\n Empirical Methods in Natural Language Processing (EMNLP 2023)"},{"id":"http://arxiv.org/abs/2310.08511v1","updated":"2023-10-12T17:06:19Z","published":"2023-10-12T17:06:19Z","title":"HoneyBee: Progressive Instruction Finetuning of Large Language Models\n for Materials Science","summary":" We propose an instruction-based process for trustworthy data curation in\nmaterials science (MatSci-Instruct), which we then apply to finetune a\nLLaMa-based language model targeted for materials science (HoneyBee).\nMatSci-Instruct helps alleviate the scarcity of relevant, high-quality\nmaterials science textual data available in the open literature, and HoneyBee\nis the first billion-parameter language model specialized to materials science.\nIn MatSci-Instruct we improve the trustworthiness of generated data by\nprompting multiple commercially available large language models for generation\nwith an Instructor module (e.g. Chat-GPT) and verification from an independent\nVerifier module (e.g. Claude). Using MatSci-Instruct, we construct a dataset of\nmultiple tasks and measure the quality of our dataset along multiple\ndimensions, including accuracy against known facts, relevance to materials\nscience, as well as completeness and reasonableness of the data. Moreover, we\niteratively generate more targeted instructions and instruction-data in a\nfinetuning-evaluation-feedback loop leading to progressively better performance\nfor our finetuned HoneyBee models. Our evaluation on the MatSci-NLP benchmark\nshows HoneyBee's outperformance of existing language models on materials\nscience tasks and iterative improvement in successive stages of\ninstruction-data refinement. We study the quality of HoneyBee's language\nmodeling through automatic evaluation and analyze case studies to further\nunderstand the model's capabilities and limitations. Our code and relevant\ndatasets are publicly available at\n\\url{https://github.com/BangLab-UdeM-Mila/NLP4MatSci-HoneyBee}.\n","authors":["Yu Song","Santiago Miret","Huan Zhang","Bang Liu"],"pdf_url":"https://arxiv.org/pdf/2310.08511v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08496v1","updated":"2023-10-12T16:55:44Z","published":"2023-10-12T16:55:44Z","title":"The Uncertainty-based Retrieval Framework for Ancient Chinese CWS and\n POS","summary":" Automatic analysis for modern Chinese has greatly improved the accuracy of\ntext mining in related fields, but the study of ancient Chinese is still\nrelatively rare. Ancient text division and lexical annotation are important\nparts of classical literature comprehension, and previous studies have tried to\nconstruct auxiliary dictionary and other fused knowledge to improve the\nperformance. In this paper, we propose a framework for ancient Chinese Word\nSegmentation and Part-of-Speech Tagging that makes a twofold effort: on the one\nhand, we try to capture the wordhood semantics; on the other hand, we\nre-predict the uncertain samples of baseline model by introducing external\nknowledge. 
The performance of our architecture outperforms pre-trained BERT\nwith CRF and existing tools such as Jiayan.\n","authors":["Pengyu Wang","Zhichen Ren"],"pdf_url":"https://arxiv.org/pdf/2310.08496v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08491v1","updated":"2023-10-12T16:50:08Z","published":"2023-10-12T16:50:08Z","title":"Prometheus: Inducing Fine-grained Evaluation Capability in Language\n Models","summary":" Recently, using a powerful proprietary Large Language Model (LLM) (e.g.,\nGPT-4) as an evaluator for long-form responses has become the de facto\nstandard. However, for practitioners with large-scale evaluation tasks and\ncustom criteria in consideration (e.g., child-readability), using proprietary\nLLMs as an evaluator is unreliable due to the closed-source nature,\nuncontrolled versioning, and prohibitive costs. In this work, we propose\nPrometheus, a fully open-source LLM that is on par with GPT-4's evaluation\ncapabilities when the appropriate reference materials (reference answer, score\nrubric) are accompanied. We first construct the Feedback Collection, a new\ndataset that consists of 1K fine-grained score rubrics, 20K instructions, and\n100K responses and language feedback generated by GPT-4. Using the Feedback\nCollection, we train Prometheus, a 13B evaluator LLM that can assess any given\nlong-form text based on customized score rubric provided by the user.\nExperimental results show that Prometheus scores a Pearson correlation of 0.897\nwith human evaluators when evaluating with 45 customized score rubrics, which\nis on par with GPT-4 (0.882), and greatly outperforms ChatGPT (0.392).\nFurthermore, measuring correlation with GPT-4 with 1222 customized score\nrubrics across four benchmarks (MT Bench, Vicuna Bench, Feedback Bench, Flask\nEval) shows similar trends, bolstering Prometheus's capability as an evaluator\nLLM. Lastly, Prometheus achieves the highest accuracy on two human preference\nbenchmarks (HHH Alignment & MT Bench Human Judgment) compared to open-sourced\nreward models explicitly trained on human preference datasets, highlighting its\npotential as an universal reward model. We open-source our code, dataset, and\nmodel at https://github.com/kaistAI/Prometheus.\n","authors":["Seungone Kim","Jamin Shin","Yejin Cho","Joel Jang","Shayne Longpre","Hwaran Lee","Sangdoo Yun","Seongjin Shin","Sungdong Kim","James Thorne","Minjoon Seo"],"pdf_url":"https://arxiv.org/pdf/2310.08491v1.pdf","comment":"Work in Progress"},{"id":"http://arxiv.org/abs/2310.08487v1","updated":"2023-10-12T16:46:58Z","published":"2023-10-12T16:46:58Z","title":"GraphextQA: A Benchmark for Evaluating Graph-Enhanced Large Language\n Models","summary":" While multi-modal models have successfully integrated information from image,\nvideo, and audio modalities, integrating graph modality into large language\nmodels (LLMs) remains unexplored. This discrepancy largely stems from the\ninherent divergence between structured graph data and unstructured text data.\nIncorporating graph knowledge provides a reliable source of information,\nenabling potential solutions to address issues in text generation, e.g.,\nhallucination, and lack of domain knowledge. To evaluate the integration of\ngraph knowledge into language models, a dedicated dataset is needed. However,\nthere is currently no benchmark dataset specifically designed for multimodal\ngraph-language models. 
To address this gap, we propose GraphextQA, a question\nanswering dataset with paired subgraphs, retrieved from Wikidata, to facilitate\nthe evaluation and future development of graph-language models. Additionally,\nwe introduce a baseline model called CrossGNN, which conditions answer\ngeneration on the paired graphs by cross-attending question-aware graph\nfeatures at decoding. The proposed dataset is designed to evaluate\ngraph-language models' ability to understand graphs and make use of it for\nanswer generation. We perform experiments with language-only models and the\nproposed graph-language model to validate the usefulness of the paired graphs\nand to demonstrate the difficulty of the task.\n","authors":["Yuanchun Shen","Ruotong Liao","Zhen Han","Yunpu Ma","Volker Tresp"],"pdf_url":"https://arxiv.org/pdf/2310.08487v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08483v1","updated":"2023-10-12T16:42:53Z","published":"2023-10-12T16:42:53Z","title":"Understanding the Humans Behind Online Misinformation: An Observational\n Study Through the Lens of the COVID-19 Pandemic","summary":" The proliferation of online misinformation has emerged as one of the biggest\nthreats to society. Considerable efforts have focused on building\nmisinformation detection models, still the perils of misinformation remain\nabound. Mitigating online misinformation and its ramifications requires a\nholistic approach that encompasses not only an understanding of its intricate\nlandscape in relation to the complex issue and topic-rich information ecosystem\nonline, but also the psychological drivers of individuals behind it. Adopting a\ntime series analytic technique and robust causal inference-based design, we\nconduct a large-scale observational study analyzing over 32 million COVID-19\ntweets and 16 million historical timeline tweets. We focus on understanding the\nbehavior and psychology of users disseminating misinformation during COVID-19\nand its relationship with the historical inclinations towards sharing\nmisinformation on Non-COVID topics before the pandemic. Our analysis\nunderscores the intricacies inherent to cross-topic misinformation, and\nhighlights that users' historical inclination toward sharing misinformation is\npositively associated with their present behavior pertaining to misinformation\nsharing on emergent topics and beyond. This work may serve as a valuable\nfoundation for designing user-centric inoculation strategies and\necologically-grounded agile interventions for effectively tackling online\nmisinformation.\n","authors":["Mohit Chandra","Anush Mattapalli","Munmun De Choudhury"],"pdf_url":"https://arxiv.org/pdf/2310.08483v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08475v1","updated":"2023-10-12T16:32:44Z","published":"2023-10-12T16:32:44Z","title":"Can We Edit Multimodal Large Language Models?","summary":" In this paper, we focus on editing Multimodal Large Language Models (MLLMs).\nCompared to editing single-modal LLMs, multimodal model editing is more\nchallenging, which demands a higher level of scrutiny and careful consideration\nin the editing process. To facilitate research in this area, we construct a new\nbenchmark, dubbed MMEdit, for editing multimodal LLMs and establishing a suite\nof innovative metrics for evaluation. We conduct comprehensive experiments\ninvolving various model editing baselines and analyze the impact of editing\ndifferent components for multimodal LLMs. 
Empirically, we notice that previous\nbaselines can implement editing multimodal LLMs to some extent, but the effect\nis still barely satisfactory, indicating the potential difficulty of this task.\nWe hope that our work can provide the NLP community with insights\\footnote{Code\nand dataset are available in https://github.com/zjunlp/EasyEdit.\n","authors":["Siyuan Cheng","Bozhong Tian","Qingbin Liu","Xi Chen","Yongheng Wang","Huajun Chen","Ningyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.08475v1.pdf","comment":"EMNLP 2023"},{"id":"http://arxiv.org/abs/2310.08461v1","updated":"2023-10-12T16:21:04Z","published":"2023-10-12T16:21:04Z","title":"DistillSpec: Improving Speculative Decoding via Knowledge Distillation","summary":" Speculative decoding (SD) accelerates large language model inference by\nemploying a faster draft model for generating multiple tokens, which are then\nverified in parallel by the larger target model, resulting in the text\ngenerated according to the target model distribution. However, identifying a\ncompact draft model that is well-aligned with the target model is challenging.\nTo tackle this issue, we propose DistillSpec that uses knowledge distillation\nto better align the draft model with the target model, before applying SD.\nDistillSpec makes two key design choices, which we demonstrate via systematic\nstudy to be crucial to improving the draft and target alignment: utilizing\non-policy data generation from the draft model, and tailoring the divergence\nfunction to the task and decoding strategy. Notably, DistillSpec yields\nimpressive 10 - 45% speedups over standard SD on a range of standard\nbenchmarks, using both greedy and non-greedy sampling. Furthermore, we combine\nDistillSpec with lossy SD to achieve fine-grained control over the latency vs.\ntask performance trade-off. Finally, in practical scenarios with models of\nvarying sizes, first using distillation to boost the performance of the target\nmodel and then applying DistillSpec to train a well-aligned draft model can\nreduce decoding latency by 6-10x with minimal performance drop, compared to\nstandard decoding without distillation.\n","authors":["Yongchao Zhou","Kaifeng Lyu","Ankit Singh Rawat","Aditya Krishna Menon","Afshin Rostamizadeh","Sanjiv Kumar","Jean-François Kagy","Rishabh Agarwal"],"pdf_url":"https://arxiv.org/pdf/2310.08461v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.15363v3","updated":"2023-10-12T16:12:57Z","published":"2022-11-28T14:38:45Z","title":"On the Security Vulnerabilities of Text-to-SQL Models","summary":" Although it has been demonstrated that Natural Language Processing (NLP)\nalgorithms are vulnerable to deliberate attacks, the question of whether such\nweaknesses can lead to software security threats is under-explored. To bridge\nthis gap, we conducted vulnerability tests on Text-to-SQL systems that are\ncommonly used to create natural language interfaces to databases. We showed\nthat the Text-to-SQL modules within six commercial applications can be\nmanipulated to produce malicious code, potentially leading to data breaches and\nDenial of Service attacks. This is the first demonstration that NLP models can\nbe exploited as attack vectors in the wild. In addition, experiments using four\nopen-source language models verified that straightforward backdoor attacks on\nText-to-SQL systems achieve a 100% success rate without affecting their\nperformance. 
The aim of this work is to draw the community's attention to\npotential software security issues associated with NLP algorithms and encourage\nexploration of methods to mitigate against them.\n","authors":["Xutan Peng","Yipeng Zhang","Jingfeng Yang","Mark Stevenson"],"pdf_url":"https://arxiv.org/pdf/2211.15363v3.pdf","comment":"ISSRE 2023: Best Paper Candidate"},{"id":"http://arxiv.org/abs/2305.14259v3","updated":"2023-10-12T16:10:51Z","published":"2023-05-23T17:12:08Z","title":"Learning to Generate Novel Scientific Directions with Contextualized\n Literature-based Discovery","summary":" Literature-Based Discovery (LBD) aims to discover new scientific knowledge by\nmining papers and generating hypotheses. Standard LBD is limited to predicting\npairwise relations between discrete concepts (e.g., drug-disease links), and\nignores critical contexts like experimental settings (e.g., a specific patient\npopulation where a drug is evaluated) and background motivations (e.g., to find\ndrugs without specific side effects). We address these limitations with a novel\nformulation of contextualized-LBD (C-LBD): generating scientific hypotheses in\nnatural language, while grounding them in a context that controls the\nhypothesis search space. We present a modeling framework using retrieval of\n``inspirations'' from past scientific papers. Our evaluations reveal that GPT-4\ntends to generate ideas with overall low technical depth and novelty, while our\ninspiration prompting approaches partially mitigate this issue. Our work\nrepresents a first step toward building language models that generate new ideas\nderived from scientific literature.\n","authors":["Qingyun Wang","Doug Downey","Heng Ji","Tom Hope"],"pdf_url":"https://arxiv.org/pdf/2305.14259v3.pdf","comment":"24 pages. Code and resource is available at\n https://github.com/EagleW/CLBD"},{"id":"http://arxiv.org/abs/2307.07864v2","updated":"2023-10-12T16:06:19Z","published":"2023-07-15T18:25:56Z","title":"CIDER: Context sensitive sentiment analysis for short-form text","summary":" Researchers commonly perform sentiment analysis on large collections of short\ntexts like tweets, Reddit posts or newspaper headlines that are all focused on\na specific topic, theme or event. Usually, general purpose sentiment analysis\nmethods are used which perform well on average but miss the variation in\nmeaning that happens across different contexts, for example, the word \"active\"\nhas a very different intention and valence in the phrase \"active lifestyle\"\nversus \"active volcano\". This work presents a new approach, CIDER (Context\nInformed Dictionary and sEntiment Reasoner), which performs context sensitive\nsentiment analysis, where the valence of sentiment laden terms is inferred from\nthe whole corpus before being used to score the individual texts. In this paper\nwe detail the CIDER algorithm and demonstrate that it outperforms\nstate-of-the-art generalist sentiment analysis on a large collection of tweets\nabout the weather. We have made our implementation of CIDER available as a\npython package: https://pypi.org/project/ciderpolarity/.\n","authors":["James C. Young","Rudy Arthur","Hywel T. P. Williams"],"pdf_url":"https://arxiv.org/pdf/2307.07864v2.pdf","comment":"12 pages, 2 figures, 5 tables"},{"id":"http://arxiv.org/abs/2310.04472v2","updated":"2023-10-12T15:57:04Z","published":"2023-10-06T04:48:48Z","title":"Effective Slogan Generation with Noise Perturbation","summary":" Slogans play a crucial role in building the brand's identity of the firm. 
A\nslogan is expected to reflect the firm's vision and the brand's value propositions in\nmemorable and likeable ways. Automating the generation of slogans with such\ncharacteristics is challenging. Previous studies developed and tested slogan\ngeneration with syntactic control and summarization models, which are not\ncapable of generating distinctive slogans. We introduce a novel approach that\nleverages a pre-trained transformer T5 model with noise perturbation on a newly\nproposed 1:N matching pair dataset. This approach serves as a contributing\nfactor in generating distinctive and coherent slogans. Furthermore, the proposed\napproach incorporates descriptions about the firm and brand into the generation\nof slogans. We evaluate generated slogans based on ROUGE1, ROUGEL and Cosine\nSimilarity metrics and also assess them with human subjects in terms of\nslogan's distinctiveness, coherence, and fluency. The results demonstrate that\nour approach yields better performance than baseline models and other\ntransformer-based models.\n","authors":["Jongeun Kim","MinChung Kim","Taehwan Kim"],"pdf_url":"https://arxiv.org/pdf/2310.04472v2.pdf","comment":"Accepted in CIKM 2023 short paper\n https://github.com/joannekim0420/SloganGeneration"},{"id":"http://arxiv.org/abs/2310.08433v1","updated":"2023-10-12T15:56:24Z","published":"2023-10-12T15:56:24Z","title":"A Confederacy of Models: a Comprehensive Evaluation of LLMs on Creative\n Writing","summary":" We evaluate a range of recent LLMs on English creative writing, a challenging\nand complex task that requires imagination, coherence, and style. We use a\ndifficult, open-ended scenario chosen to avoid training data reuse: an epic\nnarration of a single combat between Ignatius J. Reilly, the protagonist of the\nPulitzer Prize-winning novel A Confederacy of Dunces (1980), and a pterodactyl,\na prehistoric flying reptile. We ask several LLMs and humans to write such a\nstory and conduct a human evaluation involving various criteria such as fluency,\ncoherence, originality, humor, and style. Our results show that some\nstate-of-the-art commercial LLMs match or slightly outperform our writers in\nmost dimensions, whereas open-source LLMs lag behind. Humans retain an edge in\ncreativity, while humor shows a binary divide between LLMs that can handle it\ncomparably to humans and those that fail at it. We discuss the implications and\nlimitations of our study and suggest directions for future research.\n","authors":["Carlos Gómez-Rodríguez","Paul Williams"],"pdf_url":"https://arxiv.org/pdf/2310.08433v1.pdf","comment":"Accepted for publication in Findings of EMNLP 2023"},{"id":"http://arxiv.org/abs/2302.11713v4","updated":"2023-10-12T15:30:41Z","published":"2023-02-23T00:33:54Z","title":"Can Pre-trained Vision and Language Models Answer Visual\n Information-Seeking Questions?","summary":" Pre-trained vision and language models have demonstrated state-of-the-art\ncapabilities over existing tasks involving images and texts, including visual\nquestion answering. However, it remains unclear whether these models possess\nthe capability to answer questions that are not only querying visual content\nbut knowledge-intensive and information-seeking. In this study, we introduce\nInfoSeek, a visual question answering dataset tailored for information-seeking\nquestions that cannot be answered with only common sense knowledge. Using\nInfoSeek, we analyze various pre-trained visual question answering models and\ngain insights into their characteristics. 
Our findings reveal that\nstate-of-the-art pre-trained multi-modal models (e.g., PaLI-X, BLIP2, etc.)\nface challenges in answering visual information-seeking questions, but\nfine-tuning on the InfoSeek dataset elicits models to use fine-grained\nknowledge that was learned during their pre-training. Furthermore, we show that\naccurate visual entity recognition can be used to improve performance on\nInfoSeek by retrieving relevant documents, showing a significant space for\nimprovement.\n","authors":["Yang Chen","Hexiang Hu","Yi Luan","Haitian Sun","Soravit Changpinyo","Alan Ritter","Ming-Wei Chang"],"pdf_url":"https://arxiv.org/pdf/2302.11713v4.pdf","comment":"EMNLP 2023 (main conference); Our dataset and evaluation is available\n at https://open-vision-language.github.io/infoseek/"},{"id":"http://arxiv.org/abs/2310.08395v1","updated":"2023-10-12T15:08:14Z","published":"2023-10-12T15:08:14Z","title":"Prompting Large Language Models with Chain-of-Thought for Few-Shot\n Knowledge Base Question Generation","summary":" The task of Question Generation over Knowledge Bases (KBQG) aims to convert a\nlogical form into a natural language question. Due to the expensive cost\nof large-scale question annotation, the methods of KBQG under low-resource\nscenarios urgently need to be developed. However, current methods heavily rely\non annotated data for fine-tuning, which is not well-suited for few-shot\nquestion generation. The emergence of Large Language Models (LLMs) has shown\ntheir impressive generalization ability in few-shot tasks. Inspired by\nChain-of-Thought (CoT) prompting, which is an in-context learning strategy for\nreasoning, we formulate the KBQG task as a reasoning problem, where the generation\nof a complete question is split into a series of sub-question generation steps.\nOur proposed prompting method KQG-CoT first retrieves supportive logical forms\nfrom the unlabeled data pool taking account of the characteristics of the\nlogical form. Then, we write a prompt to make explicit the reasoning chain of\ngenerating complicated questions based on the selected demonstrations. To\nfurther ensure prompt quality, we extend KQG-CoT into KQG-CoT+ via sorting the\nlogical forms by their complexity. We conduct extensive experiments over three\npublic KBQG datasets. The results demonstrate that our prompting method\nconsistently outperforms other prompting baselines on the evaluated datasets.\nRemarkably, our KQG-CoT+ method could surpass existing few-shot SoTA results on\nthe PathQuestions dataset by 18.25, 10.72, and 10.18 absolute points on BLEU-4,\nMETEOR, and ROUGE-L, respectively.\n","authors":["Yuanyuan Liang","Jianing Wang","Hanlun Zhu","Lei Wang","Weining Qian","Yunshi Lan"],"pdf_url":"https://arxiv.org/pdf/2310.08395v1.pdf","comment":"Accepted by EMNLP 2023 main conference"},{"id":"http://arxiv.org/abs/2310.08394v1","updated":"2023-10-12T15:07:11Z","published":"2023-10-12T15:07:11Z","title":"Towards Better Evaluation of Instruction-Following: A Case-Study in\n Summarization","summary":" Despite recent advances, evaluating how well large language models (LLMs)\nfollow user instructions remains an open problem. While evaluation methods of\nlanguage models have seen a rise in prompt-based approaches, limited work on\nthe correctness of these methods has been conducted. In this work, we perform a\nmeta-evaluation of a variety of metrics to quantify how accurately they measure\nthe instruction-following abilities of LLMs. 
Our investigation is performed on\ngrounded query-based summarization by collecting a new short-form, real-world\ndataset riSum, containing $300$ document-instruction pairs with $3$ answers\neach. All $900$ answers are rated by $3$ human annotators. Using riSum, we\nanalyze agreement between evaluation methods and human judgment. Finally, we\npropose new LLM-based reference-free evaluation methods that improve upon\nestablished baselines and perform on-par with costly reference-based metrics\nwhich require high-quality summaries.\n","authors":["Ondrej Skopek","Rahul Aralikatte","Sian Gooding","Victor Carbune"],"pdf_url":"https://arxiv.org/pdf/2310.08394v1.pdf","comment":"Accepted to CoNLL 2023"},{"id":"http://arxiv.org/abs/2310.08383v1","updated":"2023-10-12T14:57:24Z","published":"2023-10-12T14:57:24Z","title":"Reconstructing Materials Tetrahedron: Challenges in Materials\n Information Extraction","summary":" Discovery of new materials has a documented history of propelling human\nprogress for centuries and more. The behaviour of a material is a function of\nits composition, structure, and properties, which further depend on its\nprocessing and testing conditions. Recent developments in deep learning and\nnatural language processing have enabled information extraction at scale from\npublished literature such as peer-reviewed publications, books, and patents.\nHowever, this information is spread in multiple formats, such as tables, text,\nand images, and with little or no uniformity in reporting style giving rise to\nseveral machine learning challenges. Here, we discuss, quantify, and document\nthese outstanding challenges in automated information extraction (IE) from\nmaterials science literature towards the creation of a large materials science\nknowledge base. Specifically, we focus on IE from text and tables and outline\nseveral challenges with examples. We hope the present work inspires researchers\nto address the challenges in a coherent fashion, providing to fillip to IE for\nthe materials knowledge base.\n","authors":["Kausik Hira","Mohd Zaki","Dhruvil Sheth"," Mausam","N M Anoop Krishnan"],"pdf_url":"https://arxiv.org/pdf/2310.08383v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08372v1","updated":"2023-10-12T14:44:05Z","published":"2023-10-12T14:44:05Z","title":"Improving Factual Consistency for Knowledge-Grounded Dialogue Systems\n via Knowledge Enhancement and Alignment","summary":" Pretrained language models (PLMs) based knowledge-grounded dialogue systems\nare prone to generate responses that are factually inconsistent with the\nprovided knowledge source. In such inconsistent responses, the dialogue models\nfail to accurately express the external knowledge they rely upon. Inspired by\nprevious work which identified that feed-forward networks (FFNs) within\nTransformers are responsible for factual knowledge expressions, we investigate\ntwo methods to efficiently improve the factual expression capability {of FFNs}\nby knowledge enhancement and alignment respectively. We first propose\n\\textsc{K-Dial}, which {explicitly} introduces {extended FFNs in Transformers\nto enhance factual knowledge expressions} given the specific patterns of\nknowledge-grounded dialogue inputs. Additionally, we apply the reinforcement\nlearning for factual consistency (RLFC) method to implicitly adjust FFNs'\nexpressions in responses by aligning with gold knowledge for the factual\nconsistency preference. 
To comprehensively assess the factual consistency and\ndialogue quality of responses, we employ extensive automatic measures and human\nevaluations including sophisticated fine-grained NLI-based metrics.\nExperimental results on WoW and CMU\\_DoG datasets demonstrate that our methods\nefficiently enhance the ability of the FFN module to convey factual knowledge,\nvalidating the efficacy of improving factual consistency for knowledge-grounded\ndialogue systems.\n","authors":["Boyang Xue","Weichao Wang","Hongru Wang","Fei Mi","Rui Wang","Yasheng Wang","Lifeng Shang","Xin Jiang","Qun Liu","Kam-Fai Wong"],"pdf_url":"https://arxiv.org/pdf/2310.08372v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08367v1","updated":"2023-10-12T14:38:25Z","published":"2023-10-12T14:38:25Z","title":"MCU: A Task-centric Framework for Open-ended Agent Evaluation in\n Minecraft","summary":" To pursue the goal of creating an open-ended agent in Minecraft, an\nopen-ended game environment with unlimited possibilities, this paper introduces\na task-centric framework named MCU for Minecraft agent evaluation. The MCU\nframework leverages the concept of atom tasks as fundamental building blocks,\nenabling the generation of diverse or even arbitrary tasks. Within the MCU\nframework, each task is measured with six distinct difficulty scores (time\nconsumption, operational effort, planning complexity, intricacy, creativity,\nnovelty). These scores offer a multi-dimensional assessment of a task from\ndifferent angles, and thus can reveal an agent's capability on specific facets.\nThe difficulty scores also serve as the feature of each task, which creates a\nmeaningful task space and unveils the relationship between tasks. For efficient\nevaluation of Minecraft agents employing the MCU framework, we maintain a\nunified benchmark, namely SkillForge, which comprises representative tasks with\ndiverse categories and difficulty distribution. We also provide convenient\nfilters for users to select tasks to assess specific capabilities of agents. We\nshow that MCU has the high expressivity to cover all tasks used in recent\nliterature on Minecraft agent, and underscores the need for advancements in\nareas such as creativity, precise control, and out-of-distribution\ngeneralization under the goal of open-ended Minecraft agent development.\n","authors":["Haowei Lin","Zihao Wang","Jianzhu Ma","Yitao Liang"],"pdf_url":"https://arxiv.org/pdf/2310.08367v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.03128v2","updated":"2023-10-12T14:37:55Z","published":"2023-10-04T19:39:26Z","title":"MetaTool Benchmark for Large Language Models: Deciding Whether to Use\n Tools and Which to Use","summary":" Large language models (LLMs) have garnered significant attention due to their\nimpressive natural language processing (NLP) capabilities. Recently, many\nstudies have focused on the tool utilization ability of LLMs. They primarily\ninvestigated how LLMs effectively collaborate with given specific tools.\nHowever, in scenarios where LLMs serve as intelligent agents, as seen in\napplications like AutoGPT and MetaGPT, LLMs are expected to engage in intricate\ndecision-making processes that involve deciding whether to employ a tool and\nselecting the most suitable tool(s) from a collection of available tools to\nfulfill user requests. Therefore, in this paper, we introduce MetaTool, a\nbenchmark designed to evaluate whether LLMs have tool usage awareness and can\ncorrectly choose tools. 
Specifically, we create a dataset called ToolE within\nthe benchmark. This dataset contains various types of user queries in the form\nof prompts that trigger LLMs to use tools, including both single-tool and\nmulti-tool scenarios. Subsequently, we set the tasks for both tool usage\nawareness and tool selection. We define four subtasks from different\nperspectives in tool selection, including tool selection with similar choices,\ntool selection in specific scenarios, tool selection with possible reliability\nissues, and multi-tool selection. We conduct experiments involving nine popular\nLLMs and find that the majority of them still struggle to effectively select\ntools, highlighting the existing gaps between LLMs and genuine intelligent\nagents. However, through the error analysis, we found there is still\nsignificant room for improvement. Finally, we conclude with insights for tool\ndevelopers that follow ChatGPT to provide detailed descriptions that can\nenhance the tool selection performance of LLMs.\n","authors":["Yue Huang","Jiawen Shi","Yuan Li","Chenrui Fan","Siyuan Wu","Qihui Zhang","Yixin Liu","Pan Zhou","Yao Wan","Neil Zhenqiang Gong","Lichao Sun"],"pdf_url":"https://arxiv.org/pdf/2310.03128v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08365v1","updated":"2023-10-12T14:36:13Z","published":"2023-10-12T14:36:13Z","title":"From Large Language Models to Knowledge Graphs for Biomarker Discovery\n in Cancer","summary":" Domain experts often rely on up-to-date knowledge for apprehending and\ndisseminating specific biological processes that help them design strategies to\ndevelop prevention and therapeutic decision-making. A challenging scenario for\nartificial intelligence (AI) is using biomedical data (e.g., texts, imaging,\nomics, and clinical) to provide diagnosis and treatment recommendations for\ncancerous conditions. Data and knowledge about cancer, drugs, genes, proteins,\nand their mechanism is spread across structured (knowledge bases (KBs)) and\nunstructured (e.g., scientific articles) sources. A large-scale knowledge graph\n(KG) can be constructed by integrating these data, followed by extracting facts\nabout semantically interrelated entities and relations. Such KGs not only allow\nexploration and question answering (QA) but also allow domain experts to deduce\nnew knowledge. However, exploring and querying large-scale KGs is tedious for\nnon-domain users due to a lack of understanding of the underlying data assets\nand semantic technologies. In this paper, we develop a domain KG to leverage\ncancer-specific biomarker discovery and interactive QA. For this, a domain\nontology called OncoNet Ontology (ONO) is developed to enable semantic\nreasoning for validating gene-disease relations. The KG is then enriched by\nharmonizing the ONO, controlled vocabularies, and additional biomedical\nconcepts from scientific articles by employing BioBERT- and SciBERT-based\ninformation extraction (IE) methods. Further, since the biomedical domain is\nevolving, where new findings often replace old ones, without employing\nup-to-date findings, there is a high chance an AI system exhibits concept drift\nwhile providing diagnosis and treatment. Therefore, we finetuned the KG using\nlarge language models (LLMs) based on more recent articles and KBs that might\nnot have been seen by the named entity recognition models.\n","authors":["Md. 
Rezaul Karim","Lina Molinas Comet","Md Shajalal","Oya Beyan","Dietrich Rebholz-Schuhmann","Stefan Decker"],"pdf_url":"https://arxiv.org/pdf/2310.08365v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2302.04737"},{"id":"http://arxiv.org/abs/2304.11657v2","updated":"2023-10-12T13:57:58Z","published":"2023-04-23T13:54:39Z","title":"Enhancing Chain-of-Thoughts Prompting with Iterative Bootstrapping in\n Large Language Models","summary":" Large language models (LLMs) can achieve highly effective performance on\nvarious reasoning tasks by incorporating step-by-step chain-of-thought (CoT)\nprompting as demonstrations. However, the reasoning chains of demonstrations\ngenerated by LLMs are prone to errors, which can subsequently lead to incorrect\nreasoning during inference. Furthermore, inappropriate exemplars (overly\nsimplistic or complex), can affect overall performance among varying levels of\ndifficulty. We introduce Iter-CoT (Iterative bootstrapping in Chain-of-Thoughts\nPrompting), an iterative bootstrapping approach for selecting exemplars and\ngenerating reasoning chains. By utilizing iterative bootstrapping, our approach\nenables LLMs to autonomously rectify errors, resulting in more precise and\ncomprehensive reasoning chains. Simultaneously, our approach selects\nchallenging yet answerable questions accompanied by reasoning chains as\nexemplars with a moderate level of difficulty, which enhances the LLMs'\ngeneralizability across varying levels of difficulty. Experimental results\nindicate that Iter-CoT exhibits superiority, achieving competitive performance\nacross three distinct reasoning tasks on ten datasets.\n","authors":["Jiashuo Sun","Yi Luo","Yeyun Gong","Chen Lin","Yelong Shen","Jian Guo","Nan Duan"],"pdf_url":"https://arxiv.org/pdf/2304.11657v2.pdf","comment":"28 pages, 10 figures, 21 tables"},{"id":"http://arxiv.org/abs/2309.16396v3","updated":"2023-10-12T13:39:39Z","published":"2023-09-28T12:43:32Z","title":"A Comprehensive Survey of Document-level Relation Extraction (2016-2023)","summary":" Document-level relation extraction (DocRE) is an active area of research in\nnatural language processing (NLP) concerned with identifying and extracting\nrelationships between entities beyond sentence boundaries. Compared to the more\ntraditional sentence-level relation extraction, DocRE provides a broader\ncontext for analysis and is more challenging because it involves identifying\nrelationships that may span multiple sentences or paragraphs. This task has\ngained increased interest as a viable solution to build and populate knowledge\nbases automatically from unstructured large-scale documents (e.g., scientific\npapers, legal contracts, or news articles), in order to have a better\nunderstanding of relationships between entities. This paper aims to provide a\ncomprehensive overview of recent advances in this field, highlighting its\ndifferent applications in comparison to sentence-level relation extraction.\n","authors":["Julien Delaunay","Hanh Thi Hong Tran","Carlos-Emiliano González-Gallardo","Georgeta Bordea","Nicolas Sidere","Antoine Doucet"],"pdf_url":"https://arxiv.org/pdf/2309.16396v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08320v1","updated":"2023-10-12T13:33:04Z","published":"2023-10-12T13:33:04Z","title":"Defending Our Privacy With Backdoors","summary":" The proliferation of large AI models trained on uncurated, often sensitive\nweb-scraped data has raised significant privacy concerns. 
One of the concerns\nis that adversaries can extract information about the training data using\nprivacy attacks. Unfortunately, the task of removing specific information from\nthe models without sacrificing performance is not straightforward and has\nproven to be challenging. We propose a rather easy yet effective defense based\non backdoor attacks to remove private information such as names of individuals\nfrom models, and focus in this work on text encoders. Specifically, through\nstrategic insertion of backdoors, we align the embeddings of sensitive phrases\nwith those of neutral terms-\"a person\" instead of the person's name. Our\nempirical results demonstrate the effectiveness of our backdoor-based defense\non CLIP by assessing its performance using a specialized privacy attack for\nzero-shot classifiers. Our approach provides not only a new \"dual-use\"\nperspective on backdoor attacks, but also presents a promising avenue to\nenhance the privacy of individuals within models trained on uncurated\nweb-scraped data.\n","authors":["Dominik Hintersdorf","Lukas Struppek","Daniel Neider","Kristian Kersting"],"pdf_url":"https://arxiv.org/pdf/2310.08320v1.pdf","comment":"14 pages, 4 figures"},{"id":"http://arxiv.org/abs/2310.08309v1","updated":"2023-10-12T13:15:11Z","published":"2023-10-12T13:15:11Z","title":"Not All Demonstration Examples are Equally Beneficial: Reweighting\n Demonstration Examples for In-Context Learning","summary":" Large Language Models (LLMs) have recently gained the In-Context Learning\n(ICL) ability with the models scaling up, allowing them to quickly adapt to\ndownstream tasks with only a few demonstration examples prepended in the input\nsequence. Nonetheless, the current practice of ICL treats all demonstration\nexamples equally, which still warrants improvement, as the quality of examples\nis usually uneven. In this paper, we investigate how to determine approximately\noptimal weights for demonstration examples and how to apply them during ICL. To\nassess the quality of weights in the absence of additional validation data, we\ndesign a masked self-prediction (MSP) score that exhibits a strong correlation\nwith the final ICL performance. To expedite the weight-searching process, we\ndiscretize the continuous weight space and adopt beam search. With\napproximately optimal weights obtained, we further propose two strategies to\napply them to demonstrations at different model positions. Experimental results\non 8 text classification tasks show that our approach outperforms conventional\nICL by a large margin. Our code are publicly available at\nhttps:github.com/Zhe-Young/WICL.\n","authors":["Zhe Yang","Damai Dai","Peiyi Wang","Zhifang Sui"],"pdf_url":"https://arxiv.org/pdf/2310.08309v1.pdf","comment":"Findings of EMNLP 2023"},{"id":"http://arxiv.org/abs/2310.08298v1","updated":"2023-10-12T13:02:34Z","published":"2023-10-12T13:02:34Z","title":"MProto: Multi-Prototype Network with Denoised Optimal Transport for\n Distantly Supervised Named Entity Recognition","summary":" Distantly supervised named entity recognition (DS-NER) aims to locate entity\nmentions and classify their types with only knowledge bases or gazetteers and\nunlabeled corpus. However, distant annotations are noisy and degrade the\nperformance of NER models. In this paper, we propose a noise-robust prototype\nnetwork named MProto for the DS-NER task. 
Different from previous\nprototype-based NER methods, MProto represents each entity type with multiple\nprototypes to characterize the intra-class variance among entity\nrepresentations. To optimize the classifier, each token should be assigned an\nappropriate ground-truth prototype, and we consider such token-prototype\nassignment as an optimal transport (OT) problem. Furthermore, to mitigate the\nnoise from incomplete labeling, we propose a novel denoised optimal transport\n(DOT) algorithm. Specifically, we utilize the assignment result between Other\nclass tokens and all prototypes to distinguish unlabeled entity tokens from\ntrue negatives. Experiments on several DS-NER benchmarks demonstrate that our\nMProto achieves state-of-the-art performance. The source code is now available\non GitHub.\n","authors":["Shuhui Wu","Yongliang Shen","Zeqi Tan","Wenqi Ren","Jietian Guo","Shiliang Pu","Weiming Lu"],"pdf_url":"https://arxiv.org/pdf/2310.08298v1.pdf","comment":"Accepted to EMNLP-2023, camera ready version"},{"id":"http://arxiv.org/abs/2310.08291v1","updated":"2023-10-12T12:52:46Z","published":"2023-10-12T12:52:46Z","title":"Expanding the Vocabulary of BERT for Knowledge Base Construction","summary":" Knowledge base construction entails acquiring structured information to\ncreate a knowledge base of factual and relational data, facilitating question\nanswering, information retrieval, and semantic understanding. The challenge\ncalled \"Knowledge Base Construction from Pretrained Language Models\" at\nInternational Semantic Web Conference 2023 defines tasks focused on\nconstructing a knowledge base using a language model. Our focus was on Track 1 of\nthe challenge, where the parameters are constrained to a maximum of 1 billion,\nand the inclusion of entity descriptions within the prompt is prohibited.\n Although the masked language model offers sufficient flexibility to extend\nits vocabulary, it is not inherently designed for multi-token prediction. To\naddress this, we present Vocabulary Expandable BERT for knowledge base\nconstruction, which expands the language model's vocabulary while preserving\nsemantic embeddings for newly added words. We adopt task-specific\nre-pre-training on the masked language model to further enhance the language model.\n Through experimentation, the results show the effectiveness of our\napproaches. Our framework achieves an F1 score of 0.323 on the hidden test set and\n0.362 on the validation set; both datasets are provided by the challenge.\nNotably, our framework adopts a lightweight language model (BERT-base, 0.13\nbillion parameters) and surpasses the model using prompts directly on large\nlanguage model (Chatgpt-3, 175 billion parameters). Moreover, Token-Recode\nachieves performance comparable to Re-pretrain. This research advances\nlanguage understanding models by enabling the direct embedding of multi-token\nentities, signifying a substantial step forward for the link prediction task in\nknowledge graphs and metadata completion in data management.\n","authors":["Dong Yang","Xu Wang","Remzi Celebi"],"pdf_url":"https://arxiv.org/pdf/2310.08291v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08280v1","updated":"2023-10-12T12:36:46Z","published":"2023-10-12T12:36:46Z","title":"Optimizing Odia Braille Literacy: The Influence of Speed on Error\n Reduction and Enhanced Comprehension","summary":" This study aims to conduct an extensive, detailed analysis of Odia Braille\nreading comprehension among students with visual disabilities. 
Specifically, the\nstudy explores their reading speed and hand or finger movements. The study also\naims to investigate any comprehension difficulties and reading errors they may\nencounter. Six students from the 9th and 10th grades, aged between 14 and 16,\nparticipated in the study. We observed participants' hand movements to\nunderstand how reading errors were connected to hand movement and identify the\nstudents' reading difficulties. We also evaluated the participants' Odia Braille\nreading skills, including their reading speed (in words per minute), errors,\nand comprehension. The average speed of an Odia Braille reader is 17.64 wpm.\nAccording to the study, there was a noticeable correlation between reading\nspeed and reading errors. As reading speed decreased, the number of reading\nerrors tended to increase. Moreover, the study established a link between\nreduced Braille reading errors and improved reading comprehension. In contrast,\nthe study found that better comprehension was associated with increased reading\nspeed. The researchers concluded with some interesting findings about preferred\nBraille reading patterns. These findings have important theoretical,\ndevelopmental, and methodological implications for instruction.\n","authors":["Monnie Parida","Manjira Sinha","Anupam Basu","Pabitra Mitra"],"pdf_url":"https://arxiv.org/pdf/2310.08280v1.pdf","comment":"4 Pages, Paper accepted in Diversity and Inclusion track at\n CODS-COMAD 2024"},{"id":"http://arxiv.org/abs/2310.08279v1","updated":"2023-10-12T12:31:23Z","published":"2023-10-12T12:31:23Z","title":"CP-KGC: Constrained-Prompt Knowledge Graph Completion with Large\n Language Models","summary":" Knowledge graph completion (KGC) aims to utilize existing knowledge to deduce\nand infer missing connections within knowledge graphs. Text-based approaches,\nlike SimKGC, have outperformed graph embedding methods, showcasing the promise\nof inductive KGC. However, the efficacy of text-based methods hinges on the\nquality of entity textual descriptions. In this paper, we identify the key\nissue of whether large language models (LLMs) can generate effective text. To\nmitigate hallucination in LLM-generated text, we introduce a\nconstraint-based prompt that utilizes the entity and its textual description as\ncontextual constraints to enhance data quality. Our Constrained-Prompt\nKnowledge Graph Completion (CP-KGC) method demonstrates effective inference\nunder low-resource computing conditions and surpasses prior results on the\nWN18RR and FB15K237 datasets. This showcases the integration of LLMs in KGC\ntasks and provides new directions for future research.\n","authors":["Rui Yang","Li Fang","Yi Zhou"],"pdf_url":"https://arxiv.org/pdf/2310.08279v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.12053v3","updated":"2023-10-12T12:06:37Z","published":"2023-09-21T13:20:13Z","title":"AceGPT, Localizing Large Language Models in Arabic","summary":" This paper is devoted to the development of a localized Large Language Model\n(LLM) specifically for Arabic, a language imbued with unique cultural\ncharacteristics inadequately addressed by current mainstream models.\nSignificant concerns emerge when addressing cultural sensitivity and local\nvalues. 
To address this, the paper proposes a comprehensive solution that\nincludes further pre-training with Arabic texts, Supervised Fine-Tuning (SFT)\nutilizing native Arabic instructions, and GPT-4 responses in Arabic, alongside\nReinforcement Learning with AI Feedback (RLAIF) employing a reward model\nattuned to local culture and values. The goal is to cultivate culturally\ncognizant and value-aligned Arabic LLMs capable of accommodating the diverse,\napplication-specific needs of Arabic-speaking communities. Comprehensive\nevaluations reveal that the resulting model, dubbed 'AceGPT', sets the\nstate-of-the-art standard for open Arabic LLMs across various benchmarks,\nincluding the instruction-following benchmark (i.e., Arabic Vicuna-80 and\nArabic AlpacaEval), knowledge benchmark (i.e., Arabic MMLU and EXAMs), and the\nnewly introduced Arabic Cultural and Value Alignment benchmark. Notably, AceGPT\noutperforms Turbo in the popular Vicuna-80 benchmark when evaluated with GPT-4,\ndespite the benchmark's limited scale. Codes, data, and models are in\nhttps://github.com/FreedomIntelligence/AceGPT.\n","authors":["Huang Huang","Fei Yu","Jianqing Zhu","Xuening Sun","Hao Cheng","Dingjie Song","Zhihong Chen","Abdulmohsen Alharthi","Bang An","Ziche Liu","Zhiyi Zhang","Junying Chen","Jianquan Li","Benyou Wang","Lian Zhang","Ruoyu Sun","Xiang Wan","Haizhou Li","Jinchao Xu"],"pdf_url":"https://arxiv.org/pdf/2309.12053v3.pdf","comment":"https://github.com/FreedomIntelligence/AceGPT"},{"id":"http://arxiv.org/abs/2310.08256v1","updated":"2023-10-12T12:01:32Z","published":"2023-10-12T12:01:32Z","title":"Impact of Co-occurrence on Factual Knowledge of Large Language Models","summary":" Large language models (LLMs) often make factually incorrect responses despite\ntheir success in various applications. In this paper, we hypothesize that\nrelying heavily on simple co-occurrence statistics of the pre-training corpora\nis one of the main factors that cause factual errors. Our results reveal that\nLLMs are vulnerable to the co-occurrence bias, defined as preferring frequently\nco-occurred words over the correct answer. Consequently, LLMs struggle to\nrecall facts whose subject and object rarely co-occur in the pre-training\ndataset although they are seen during finetuning. We show that co-occurrence\nbias remains despite scaling up model sizes or finetuning. Therefore, we\nsuggest finetuning on a debiased dataset to mitigate the bias by filtering out\nbiased samples whose subject-object co-occurrence count is high. Although\ndebiased finetuning allows LLMs to memorize rare facts in the training set, it\nis not effective in recalling rare facts unseen during finetuning. Further\nresearch in mitigation will help build reliable language models by preventing\npotential errors. The code is available at\n\\url{https://github.com/CheongWoong/impact_of_cooccurrence}.\n","authors":["Cheongwoong Kang","Jaesik Choi"],"pdf_url":"https://arxiv.org/pdf/2310.08256v1.pdf","comment":"EMNLP 2023 Findings"},{"id":"http://arxiv.org/abs/2309.02092v3","updated":"2023-10-12T11:46:23Z","published":"2023-09-05T09:56:29Z","title":"Where are We in Event-centric Emotion Analysis? Bridging Emotion Role\n Labeling and Appraisal-based Approaches","summary":" The term emotion analysis in text subsumes various natural language\nprocessing tasks which have in common the goal to enable computers to\nunderstand emotions. Most popular is emotion classification in which one or\nmultiple emotions are assigned to a predefined textual unit. 
While such a setting\nis appropriate for identifying the reader's or author's emotion, emotion role\nlabeling adds the perspective of mentioned entities and extracts text spans\nthat correspond to the emotion cause. The underlying emotion theories agree on\none important point: that an emotion is caused by some internal or external\nevent and comprises several subcomponents, including the subjective feeling and\na cognitive evaluation. We therefore argue that emotions and events are related\nin two ways. (1) Emotions are events; this perspective is the foundation in\nnatural language processing for emotion role labeling. (2) Emotions are caused\nby events; a perspective that is made explicit with research on how to incorporate\npsychological appraisal theories in NLP models to interpret events. These two\nresearch directions, role labeling and (event-focused) emotion classification,\nhave by and large been tackled separately. In this paper, we contextualize both\nperspectives and discuss open research questions.\n","authors":["Roman Klinger"],"pdf_url":"https://arxiv.org/pdf/2309.02092v3.pdf","comment":"accepted to the Big Picture Workshop\n (https://bigpictureworkshop.com/)"},{"id":"http://arxiv.org/abs/2305.15020v2","updated":"2023-10-12T11:45:56Z","published":"2023-05-24T11:00:33Z","title":"An Efficient Multilingual Language Model Compression through Vocabulary\n Trimming","summary":" Multilingual language models (LMs) have become a powerful tool in NLP,\nespecially for non-English languages. Nevertheless, model parameters of\nmultilingual LMs remain large due to the larger embedding matrix of the\nvocabulary covering tokens in different languages. In contrast, monolingual\nLMs can be trained in a target language with the language-specific vocabulary\nonly, but this requires a large budget and the availability of reliable corpora to\nachieve a high-quality LM from scratch. In this paper, we propose\nvocabulary-trimming (VT), a method to reduce a multilingual LM vocabulary to a\ntarget language by deleting irrelevant tokens from its vocabulary. In theory,\nVT can compress any existing multilingual LM to build monolingual LMs in any\nlanguage covered by the multilingual LM. In our experiments, we show that VT\ncan retain the original performance of the multilingual LM, while being smaller\nin size (in general around 50% of the original vocabulary size is enough) than\nthe original multilingual LM. The evaluation is performed over four NLP tasks\n(two generative and two classification tasks) among four widely used\nmultilingual LMs in seven languages. Finally, we show that this methodology can\nkeep the best of both monolingual and multilingual worlds by keeping a small\nsize, as monolingual models do, without the need for specifically retraining them,\nand even limiting potentially harmful social biases.\n","authors":["Asahi Ushio","Yi Zhou","Jose Camacho-Collados"],"pdf_url":"https://arxiv.org/pdf/2305.15020v2.pdf","comment":"EMNLP 2023 findings"},{"id":"http://arxiv.org/abs/2304.00457v3","updated":"2023-10-12T11:35:35Z","published":"2023-04-02T05:47:09Z","title":"LLMMaps -- A Visual Metaphor for Stratified Evaluation of Large Language\n Models","summary":" Large Language Models (LLMs) have revolutionized natural language processing\nand demonstrated impressive capabilities in various tasks. Unfortunately, they\nare prone to hallucinations, where the model exposes incorrect or false\ninformation in its responses, which renders diligent evaluation approaches\nmandatory. 
While LLM performance in specific knowledge fields is often\nevaluated based on question and answer (Q&A) datasets, such evaluations usually\nreport only a single accuracy number for the dataset, which often covers an\nentire field. This field-based evaluation, is problematic with respect to\ntransparency and model improvement. A stratified evaluation could instead\nreveal subfields, where hallucinations are more likely to occur and thus help\nto better assess LLMs' risks and guide their further development. To support\nsuch stratified evaluations, we propose LLMMaps as a novel visualization\ntechnique that enables users to evaluate LLMs' performance with respect to Q&A\ndatasets. LLMMaps provide detailed insights into LLMs' knowledge capabilities\nin different subfields, by transforming Q&A datasets as well as LLM responses\ninto an internal knowledge structure. An extension for comparative\nvisualization furthermore, allows for the detailed comparison of multiple LLMs.\nTo assess LLMMaps we use them to conduct a comparative analysis of several\nstate-of-the-art LLMs, such as BLOOM, GPT-2, GPT-3, ChatGPT and LLaMa-13B, as\nwell as two qualitative user evaluations. All necessary source code and data\nfor generating LLMMaps to be used in scientific publications and elsewhere is\navailable on GitHub: https://github.com/viscom-ulm/LLMMaps\n","authors":["Patrik Puchert","Poonam Poonam","Christian van Onzenoodt","Timo Ropinski"],"pdf_url":"https://arxiv.org/pdf/2304.00457v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08240v1","updated":"2023-10-12T11:35:24Z","published":"2023-10-12T11:35:24Z","title":"Who Said That? Benchmarking Social Media AI Detection","summary":" AI-generated text has proliferated across various online platforms, offering\nboth transformative prospects and posing significant risks related to\nmisinformation and manipulation. Addressing these challenges, this paper\nintroduces SAID (Social media AI Detection), a novel benchmark developed to\nassess AI-text detection models' capabilities in real social media platforms.\nIt incorporates real AI-generate text from popular social media platforms like\nZhihu and Quora. Unlike existing benchmarks, SAID deals with content that\nreflects the sophisticated strategies employed by real AI users on the Internet\nwhich may evade detection or gain visibility, providing a more realistic and\nchallenging evaluation landscape. A notable finding of our study, based on the\nZhihu dataset, reveals that annotators can distinguish between AI-generated and\nhuman-generated texts with an average accuracy rate of 96.5%. This finding\nnecessitates a re-evaluation of human capability in recognizing AI-generated\ntext in today's widely AI-influenced environment. Furthermore, we present a new\nuser-oriented AI-text detection challenge focusing on the practicality and\neffectiveness of identifying AI-generated text based on user information and\nmultiple responses. The experimental results demonstrate that conducting\ndetection tasks on actual social media platforms proves to be more challenging\ncompared to traditional simulated AI-text detection, resulting in a decreased\naccuracy. 
On the other hand, user-oriented AI-generated text detection\nsignificantly improve the accuracy of detection.\n","authors":["Wanyun Cui","Linqiu Zhang","Qianle Wang","Shuyang Cai"],"pdf_url":"https://arxiv.org/pdf/2310.08240v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08232v1","updated":"2023-10-12T11:25:46Z","published":"2023-10-12T11:25:46Z","title":"Language Models are Universal Embedders","summary":" In the large language model (LLM) revolution, embedding is a key component of\nvarious systems. For example, it is used to retrieve knowledge or memories for\nLLMs, to build content moderation filters, etc. As such cases span from English\nto other natural or programming languages, from retrieval to classification and\nbeyond, it is desirable to build a unified embedding model rather than\ndedicated ones for each scenario. In this work, we make an initial step towards\nthis goal, demonstrating that multiple languages (both natural and programming)\npre-trained transformer decoders can embed universally when finetuned on\nlimited English data. We provide a comprehensive practice with thorough\nevaluations. On English MTEB, our models achieve competitive performance on\ndifferent embedding tasks by minimal training data. On other benchmarks, such\nas multilingual classification and code search, our models (without any\nsupervision) perform comparably to, or even surpass heavily supervised\nbaselines and/or APIs. These results provide evidence of a promising path\ntowards building powerful unified embedders that can be applied across tasks\nand languages.\n","authors":["Xin Zhang","Zehan Li","Yanzhao Zhang","Dingkun Long","Pengjun Xie","Meishan Zhang","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.08232v1.pdf","comment":"13 pages, in progress"},{"id":"http://arxiv.org/abs/2310.08225v1","updated":"2023-10-12T11:17:40Z","published":"2023-10-12T11:17:40Z","title":"Fast Word Error Rate Estimation Using Self-Supervised Representations\n For Speech And Text","summary":" The quality of automatic speech recognition (ASR) is typically measured by\nword error rate (WER). WER estimation is a task aiming to predict the WER of an\nASR system, given a speech utterance and a transcription. This task has gained\nincreasing attention while advanced ASR systems are trained on large amounts of\ndata. In this case, WER estimation becomes necessary in many scenarios, for\nexample, selecting training data with unknown transcription quality or\nestimating the testing performance of an ASR system without ground truth\ntranscriptions. Facing large amounts of data, the computation efficiency of a\nWER estimator becomes essential in practical applications. However, previous\nworks usually did not consider it as a priority. In this paper, a Fast WER\nestimator (Fe-WER) using self-supervised learning representation (SSLR) is\nintroduced. The estimator is built upon SSLR aggregated by average pooling. The\nresults show that Fe-WER outperformed the e-WER3 baseline relatively by 19.69%\nand 7.16% on Ted-Lium3 in both evaluation metrics of root mean square error and\nPearson correlation coefficient, respectively. Moreover, the estimation\nweighted by duration was 10.43% when the target was 10.88%. 
Lastly, the\ninference speed was about 4x in terms of a real-time factor.\n","authors":["Chanho Park","Chengsong Lu","Mingjie Chen","Thomas Hain"],"pdf_url":"https://arxiv.org/pdf/2310.08225v1.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2310.08221v1","updated":"2023-10-12T11:11:54Z","published":"2023-10-12T11:11:54Z","title":"SimCKP: Simple Contrastive Learning of Keyphrase Representations","summary":" Keyphrase generation (KG) aims to generate a set of summarizing words or\nphrases given a source document, while keyphrase extraction (KE) aims to\nidentify them from the text. Because the search space is much smaller in KE, it\nis often combined with KG to predict keyphrases that may or may not exist in\nthe corresponding document. However, current unified approaches adopt sequence\nlabeling and maximization-based generation that primarily operate at a token\nlevel, falling short in observing and scoring keyphrases as a whole. In this\nwork, we propose SimCKP, a simple contrastive learning framework that consists\nof two stages: 1) An extractor-generator that extracts keyphrases by learning\ncontext-aware phrase-level representations in a contrastive manner while also\ngenerating keyphrases that do not appear in the document; 2) A reranker that\nadapts scores for each generated phrase by likewise aligning their\nrepresentations with the corresponding document. Experimental results on\nmultiple benchmark datasets demonstrate the effectiveness of our proposed\napproach, which outperforms the state-of-the-art models by a significant\nmargin.\n","authors":["Minseok Choi","Chaeheon Gwak","Seho Kim","Si Hyeong Kim","Jaegul Choo"],"pdf_url":"https://arxiv.org/pdf/2310.08221v1.pdf","comment":"Accepted to Findings of EMNLP 2023"},{"id":"http://arxiv.org/abs/2309.16292v2","updated":"2023-10-12T11:11:47Z","published":"2023-09-28T09:41:35Z","title":"DiLu: A Knowledge-Driven Approach to Autonomous Driving with Large\n Language Models","summary":" Recent advancements in autonomous driving have relied on data-driven\napproaches, which are widely adopted but face challenges including dataset\nbias, overfitting, and uninterpretability. Drawing inspiration from the\nknowledge-driven nature of human driving, we explore the question of how to\ninstill similar capabilities into autonomous driving systems and summarize a\nparadigm that integrates an interactive environment, a driver agent, as well as\na memory component to address this question. Leveraging large language models\nwith emergent abilities, we propose the DiLu framework, which combines a\nReasoning and a Reflection module to enable the system to perform\ndecision-making based on common-sense knowledge and evolve continuously.\nExtensive experiments prove DiLu's capability to accumulate experience and\ndemonstrate a significant advantage in generalization ability over\nreinforcement learning-based methods. Moreover, DiLu is able to directly\nacquire experiences from real-world datasets which highlights its potential to\nbe deployed on practical autonomous driving systems. 
To the best of our\nknowledge, we are the first to instill knowledge-driven capability into\nautonomous driving systems from the perspective of how humans drive.\n","authors":["Licheng Wen","Daocheng Fu","Xin Li","Xinyu Cai","Tao Ma","Pinlong Cai","Min Dou","Botian Shi","Liang He","Yu Qiao"],"pdf_url":"https://arxiv.org/pdf/2309.16292v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08187v1","updated":"2023-10-12T10:26:26Z","published":"2023-10-12T10:26:26Z","title":"Visual Question Generation in Bengali","summary":" The task of Visual Question Generation (VQG) is to generate human-like\nquestions relevant to the given image. As VQG is an emerging research field,\nexisting works tend to focus only on resource-rich languages such as English due\nto the availability of datasets. In this paper, we propose the first Bengali\nVisual Question Generation task and develop a novel transformer-based\nencoder-decoder architecture that generates questions in Bengali when given an\nimage. We propose multiple variants of models - (i) image-only: baseline model\nof generating questions from images without additional information, (ii)\nimage-category and image-answer-category: guided VQG where we condition the\nmodel to generate questions based on the answer and the category of expected\nquestion. These models are trained and evaluated on the translated VQAv2.0\ndataset. Our quantitative and qualitative results establish the first\nstate-of-the-art models for the VQG task in Bengali and demonstrate that our models\nare capable of generating grammatically correct and relevant questions. Our\nquantitative results show that our image-cat model achieves a BLEU-1 score of\n33.12 and a BLEU-3 score of 7.56, the highest among the three variants.\nWe also perform a human evaluation to assess the quality of the generation\ntasks. Human evaluation suggests that the image-cat model is capable of generating\ngoal-driven and attribute-specific questions and also stays relevant to the\ncorresponding image.\n","authors":["Mahmud Hasan","Labiba Islam","Jannatul Ferdous Ruma","Tasmiah Tahsin Mayeesha","Rashedur M. Rahman"],"pdf_url":"https://arxiv.org/pdf/2310.08187v1.pdf","comment":"19 pages including references, 4 figures and 3 tables. Accepted in\n the Proceedings of the Workshop on Multimodal, Multilingual Natural Language\n Generation and Multilingual WebNLG Challenge (MM-NLG 2023)"},{"id":"http://arxiv.org/abs/2310.08185v1","updated":"2023-10-12T10:21:37Z","published":"2023-10-12T10:21:37Z","title":"EIPE-text: Evaluation-Guided Iterative Plan Extraction for Long-Form\n Narrative Text Generation","summary":" Plan-and-Write is a common hierarchical approach in long-form narrative text\ngeneration, which first creates a plan to guide the narrative writing.\nFollowing this approach, several studies rely on simply prompting large\nlanguage models for planning, which often yields suboptimal results. In this\npaper, we propose a new framework called Evaluation-guided Iterative Plan\nExtraction for long-form narrative text generation (EIPE-text), which extracts\nplans from the corpus of narratives and utilizes the extracted plans to\nconstruct a better planner. EIPE-text has three stages: plan extraction,\nlearning, and inference. In the plan extraction stage, it iteratively extracts\nand improves plans from the narrative corpus and constructs a plan corpus. 
We\npropose a question answer (QA) based evaluation mechanism to automatically\nevaluate the plans and generate detailed plan refinement instructions to guide\nthe iterative improvement. In the learning stage, we build a better planner by\nfine-tuning with the plan corpus or in-context learning with examples in the\nplan corpus. Finally, we leverage a hierarchical approach to generate long-form\nnarratives. We evaluate the effectiveness of EIPE-text in the domains of novels\nand storytelling. Both GPT-4-based evaluations and human evaluations\ndemonstrate that our method can generate more coherent and relevant long-form\nnarratives. Our code will be released in the future.\n","authors":["Wang You","Wenshan Wu","Yaobo Liang","Shaoguang Mao","Chenfei Wu","Maosong Cao","Yuzhe Cai","Yiduo Guo","Yan Xia","Furu Wei","Nan Duan"],"pdf_url":"https://arxiv.org/pdf/2310.08185v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08172v1","updated":"2023-10-12T09:55:45Z","published":"2023-10-12T09:55:45Z","title":"Exploring the Cognitive Knowledge Structure of Large Language Models: An\n Educational Diagnostic Assessment Approach","summary":" Large Language Models (LLMs) have not only exhibited exceptional performance\nacross various tasks, but also demonstrated sparks of intelligence. Recent\nstudies have focused on assessing their capabilities on human exams and\nrevealed their impressive competence in different domains. However, cognitive\nresearch on the overall knowledge structure of LLMs is still lacking. In this\npaper, based on educational diagnostic assessment method, we conduct an\nevaluation using MoocRadar, a meticulously annotated human test dataset based\non Bloom Taxonomy. We aim to reveal the knowledge structures of LLMs and gain\ninsights of their cognitive capabilities. This research emphasizes the\nsignificance of investigating LLMs' knowledge and understanding the disparate\ncognitive patterns of LLMs. By shedding light on models' knowledge, researchers\ncan advance development and utilization of LLMs in a more informed and\neffective manner.\n","authors":["Zheyuan Zhang","Jifan Yu","Juanzi Li","Lei Hou"],"pdf_url":"https://arxiv.org/pdf/2310.08172v1.pdf","comment":"Findings of EMNLP 2023 (Short Paper)"},{"id":"http://arxiv.org/abs/2310.08170v1","updated":"2023-10-12T09:49:10Z","published":"2023-10-12T09:49:10Z","title":"Simplicity Level Estimate (SLE): A Learned Reference-Less Metric for\n Sentence Simplification","summary":" Automatic evaluation for sentence simplification remains a challenging\nproblem. Most popular evaluation metrics require multiple high-quality\nreferences -- something not readily available for simplification -- which makes\nit difficult to test performance on unseen domains. Furthermore, most existing\nmetrics conflate simplicity with correlated attributes such as fluency or\nmeaning preservation. 
We propose a new learned evaluation metric (SLE) which\nfocuses on simplicity, outperforming almost all existing metrics in terms of\ncorrelation with human judgements.\n","authors":["Liam Cripwell","Joël Legrand","Claire Gardent"],"pdf_url":"https://arxiv.org/pdf/2310.08170v1.pdf","comment":"Accepted to EMNLP 2023 (Main Conference)"},{"id":"http://arxiv.org/abs/2310.08167v1","updated":"2023-10-12T09:41:22Z","published":"2023-10-12T09:41:22Z","title":"Multiclass Classification of Policy Documents with Large Language Models","summary":" Classifying policy documents into policy issue topics has been a long-time\neffort in political science and communication disciplines. Efforts to automate\ntext classification processes for social science research purposes have so far\nachieved remarkable results, but there is still much room for progress. In\nthis work, we test the prediction performance of an alternative strategy, which\nrequires much less human involvement than full manual coding. We use OpenAI's GPT\n3.5 and GPT 4 models, which are pre-trained instruction-tuned\nLarge Language Models (LLMs), to classify congressional bills and congressional\nhearings into the Comparative Agendas Project's 21 major policy issue topics. We\npropose three use-case scenarios and estimate overall accuracies ranging from\n58% to 83% depending on the scenario and GPT model employed. The three scenarios aim\nat minimal, moderate, and major human interference, respectively. Overall, our\nresults point towards the insufficiency of complete reliance on GPT with\nminimal human intervention, an increasing accuracy along with the human effort\nexerted, and a surprisingly high accuracy achieved in the most humanly\ndemanding use-case. However, the superior use-case achieved 83% accuracy on\nthe 65% of the data on which the two models agreed, suggesting that a similar\napproach to ours can be relatively easily implemented and allow for mostly\nautomated coding of a majority of a given dataset. This could free up resources,\nallowing manual human coding of the remaining 35% of the data to achieve an\noverall higher level of accuracy while reducing costs significantly.\n","authors":["Erkan Gunes","Christoffer Koch Florczak"],"pdf_url":"https://arxiv.org/pdf/2310.08167v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08166v1","updated":"2023-10-12T09:39:17Z","published":"2023-10-12T09:39:17Z","title":"Ziya-VL: Bilingual Large Vision-Language Model via Multi-Task\n Instruction Tuning","summary":" Recent advancements enlarge the capabilities of large language models (LLMs)\nin zero-shot image-to-text generation and understanding by integrating\nmulti-modal inputs. However, such success is typically limited to English\nscenarios due to the lack of large-scale and high-quality non-English\nmulti-modal resources, making it extremely difficult to establish competitive\ncounterparts in other languages. In this paper, we introduce the Ziya-VL\nseries, a set of bilingual large-scale vision-language models (LVLMs) designed\nto incorporate visual semantics into LLMs for multi-modal dialogue. Composed of\nZiya-VL-Base and Ziya-VL-Chat, our models adopt the Querying Transformer from\nBLIP-2, further exploring the assistance of optimization schemes such as\ninstruction tuning, multi-stage training and a low-rank adaptation module for\nvisual-language alignment. 
In addition, we stimulate the understanding ability\nof GPT-4 in multi-modal scenarios, translating our gathered English image-text\ndatasets into Chinese and generating instruction-response pairs through\nin-context learning. The experimental results demonstrate that compared to\nthe existing LVLMs, Ziya-VL achieves competitive performance across a wide\nrange of English-only tasks including zero-shot image-text retrieval, image\ncaptioning, and visual question answering. The evaluation leaderboard accessed\nby GPT-4 also indicates that our models possess satisfactory image-text\nunderstanding and generation capabilities in Chinese multi-modal scenario\ndialogues. Code, demo and models are available at\n~\\url{https://huggingface.co/IDEA-CCNL/Ziya-BLIP2-14B-Visual-v1}.\n","authors":["Junyu Lu","Dixiang Zhang","Xiaojun Wu","Xinyu Gao","Ruyi Gan","Jiaxing Zhang","Yan Song","Pingjian Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.08166v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08152v1","updated":"2023-10-12T09:18:19Z","published":"2023-10-12T09:18:19Z","title":"Context Compression for Auto-regressive Transformers with Sentinel\n Tokens","summary":" The quadratic complexity of the attention module causes it to gradually become\nthe bulk of compute in Transformer-based LLMs during generation. Moreover, the\nexcessive key-value cache that arises when dealing with long inputs also brings\nsevere issues on memory footprint and inference latency. In this work, we\npropose a plug-and-play approach that is able to incrementally compress the\nintermediate activation of a specified span of tokens into compact ones,\nthereby reducing both memory and computational cost when processing subsequent\ncontext. Experiments on both in-domain language modeling and zero-shot\nopen-ended document generation demonstrate the advantage of our approach over\nsparse attention baselines in terms of fluency, n-gram matching, and semantic\nsimilarity. Lastly, we comprehensively profile the benefit of context\ncompression on improving the system throughput. Code is available at\nhttps://github.com/DRSY/KV_Compression.\n","authors":["Siyu Ren","Qi Jia","Kenny Q. Zhu"],"pdf_url":"https://arxiv.org/pdf/2310.08152v1.pdf","comment":"To appear at EMNLP 2023"},{"id":"http://arxiv.org/abs/2310.05199v2","updated":"2023-10-12T09:04:07Z","published":"2023-10-08T15:14:39Z","title":"Loose lips sink ships: Mitigating Length Bias in Reinforcement Learning\n from Human Feedback","summary":" Reinforcement learning from human feedback serves as a crucial bridge,\naligning large language models with human and societal values. This alignment\nrequires a vast corpus of human feedback to learn a reward model, which is\nsubsequently used to finetune language models. However, we have identified that\nthe reward model often finds shortcuts to bypass its intended objectives,\nmisleadingly assuming that humans prefer longer responses. The emergence of\nlength bias often induces the model to favor longer outputs, yet it doesn't\nequate to an increase in helpful information within these outputs. In this\npaper, we propose an innovative solution, applying the Product-of-Experts (PoE)\ntechnique to separate reward modeling from the influence of sequence length. 
In\nour framework, the main expert concentrates on understanding human intents,\nwhile the biased expert targets the identification and capture of length bias.\nTo further enhance the learning of bias, we introduce perturbations into the\nbias-focused expert, disrupting the flow of semantic information. Experimental\nresults validate the effectiveness of our approach, indicating that language\nmodel performance is improved, irrespective of sequence length.\n","authors":["Wei Shen","Rui Zheng","Wenyu Zhan","Jun Zhao","Shihan Dou","Tao Gui","Qi Zhang","Xuanjing Huang"],"pdf_url":"https://arxiv.org/pdf/2310.05199v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11534v4","updated":"2023-10-12T08:50:19Z","published":"2023-08-21T06:51:56Z","title":"PlatoLM: Teaching LLMs via a Socratic Questioning User Simulator","summary":" The unparalleled performance of closed-sourced ChatGPT has sparked efforts\ntowards its democratization, with notable strides made by leveraging real user\nand ChatGPT conversations, as evidenced by Vicuna. However, due to challenges\nin gathering conversations involving human participation, current endeavors\nlike Baize and UltraChat aim to automatically generate conversational data.\nThey primarily rely on ChatGPT conducting roleplay to simulate human behaviors\nbased on instructions rather than genuine learning from humans, resulting in\nlimited scope, diminished diversity, and an absence of genuine multi-round\nconversational dynamics. To address the above issues, we target human questions\nextracted from genuine human-machine conversations as a learning goal and train\na user simulator called `Socratic' to produce a high-quality human-centric\nsynthetic conversation dataset. Subsequently, this dataset was used to train\nour assistant model, named `PlatoLM'. Experimentally, PlatoLM outpaces baseline\nmodels in both Vicuna-Bench and MT-Bench by pairwise comparison when\nconsidering equivalent training set sizes, and manual evaluation also shows\nthat our model is highly competitive. Impressively, when fine-tuned with the\nlatest LLaMA 2 model, PlatoLM achieves the SOTA performance among 7B models\n(including LLaMA-2-7B-chat and Vicuna-7B) in MT-Bench benchmark and in\nAlpaca-Eval benchmark, it ranks second among 7B models, even beating some\nlarger scale models (including LLaMA-2-13B-chat and GPT-3.5). Further in-depth\nanalysis demonstrates the scalability and transferability of our approach. The\ncode is available at https://github.com/FreedomIntelligence/PlatoLM.\n","authors":["Chuyi Kong","Yaxin Fan","Xiang Wan","Feng Jiang","Benyou Wang"],"pdf_url":"https://arxiv.org/pdf/2308.11534v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08132v1","updated":"2023-10-12T08:45:21Z","published":"2023-10-12T08:45:21Z","title":"On the Relevance of Phoneme Duration Variability of Synthesized Training\n Data for Automatic Speech Recognition","summary":" Synthetic data generated by text-to-speech (TTS) systems can be used to\nimprove automatic speech recognition (ASR) systems in low-resource or domain\nmismatch tasks. It has been shown that TTS-generated outputs still do not have\nthe same qualities as real data. In this work we focus on the temporal\nstructure of synthetic data and its relation to ASR training. By using a novel\noracle setup we show how much the degradation of synthetic data quality is\ninfluenced by duration modeling in non-autoregressive (NAR) TTS. 
To get\nreference phoneme durations we use two common alignment methods, a hidden\nMarkov Gaussian-mixture model (HMM-GMM) aligner and a neural connectionist\ntemporal classification (CTC) aligner. Using a simple algorithm based on random\nwalks we shift phoneme duration distributions of the TTS system closer to real\ndurations, resulting in an improvement of an ASR system using synthetic data in\na semi-supervised setting.\n","authors":["Nick Rossenbach","Benedikt Hilmes","Ralf Schlüter"],"pdf_url":"https://arxiv.org/pdf/2310.08132v1.pdf","comment":"To appear at ASRU 2023"},{"id":"http://arxiv.org/abs/2310.08130v1","updated":"2023-10-12T08:38:12Z","published":"2023-10-12T08:38:12Z","title":"Fine-grained Conversational Decoding via Isotropic and Proximal Search","summary":" General-purpose text decoding approaches are usually adopted for dialogue\nresponse generation. Although the quality of the generated responses can be\nimproved with dialogue-specific encoding methods, conversational decoding\nmethods are still under-explored. Inspired by \\citet{wu2023learning} that a\ngood dialogue feature space should follow the rules of locality and isotropy,\nwe present a fine-grained conversational decoding method, termed\n\\textit{isotropic and proximal search (IPS)}. Our method is designed to\ngenerate the semantic-concentrated response, while still maintaining\ninformativeness and discrimination against the context. Experiments show that\nour approach outperforms existing decoding strategies in the dialogue field\nacross both automatic and human evaluation metrics. More in-depth analyses\nfurther confirm the effectiveness of our approach.\n","authors":["Yuxuan Yao","Han Wu","Qiling Xu","Linqi Song"],"pdf_url":"https://arxiv.org/pdf/2310.08130v1.pdf","comment":"To appear in EMNLP 2024"},{"id":"http://arxiv.org/abs/2310.08123v1","updated":"2023-10-12T08:24:15Z","published":"2023-10-12T08:24:15Z","title":"Who Wrote it and Why? Prompting Large-Language Models for Authorship\n Verification","summary":" Authorship verification (AV) is a fundamental task in natural language\nprocessing (NLP) and computational linguistics, with applications in forensic\nanalysis, plagiarism detection, and identification of deceptive content.\nExisting AV techniques, including traditional stylometric and deep learning\napproaches, face limitations in terms of data requirements and lack of\nexplainability. To address these limitations, this paper proposes PromptAV, a\nnovel technique that leverages Large-Language Models (LLMs) for AV by providing\nstep-by-step stylometric explanation prompts. PromptAV outperforms\nstate-of-the-art baselines, operates effectively with limited training data,\nand enhances interpretability through intuitive explanations, showcasing its\npotential as an effective and interpretable solution for the AV task.\n","authors":["Chia-Yu Hung","Zhiqiang Hu","Yujia Hu","Roy Ka-Wei Lee"],"pdf_url":"https://arxiv.org/pdf/2310.08123v1.pdf","comment":"7 pages,1 figure"},{"id":"http://arxiv.org/abs/2310.08104v1","updated":"2023-10-12T08:00:25Z","published":"2023-10-12T08:00:25Z","title":"Voice Conversion for Stuttered Speech, Instruments, Unseen Languages and\n Textually Described Voices","summary":" Voice conversion aims to convert source speech into a target voice using\nrecordings of the target speaker as a reference. Newer models are producing\nincreasingly realistic output. But what happens when models are fed with\nnon-standard data, such as speech from a user with a speech impairment? 
We\ninvestigate how a recent voice conversion model performs on non-standard\ndownstream voice conversion tasks. We use a simple but robust approach called\nk-nearest neighbors voice conversion (kNN-VC). We look at four non-standard\napplications: stuttered voice conversion, cross-lingual voice conversion,\nmusical instrument conversion, and text-to-voice conversion. The latter\ninvolves converting to a target voice specified through a text description,\ne.g. \"a young man with a high-pitched voice\". Compared to an established\nbaseline, we find that kNN-VC retains high performance in stuttered and\ncross-lingual voice conversion. Results are more mixed for the musical\ninstrument and text-to-voice conversion tasks. E.g., kNN-VC works well on some\ninstruments like drums but not on others. Nevertheless, this shows that voice\nconversion models - and kNN-VC in particular - are increasingly applicable in a\nrange of non-standard downstream tasks. But there are still limitations when\nsamples are very far from the training distribution. Code, samples, trained\nmodels: https://rf5.github.io/sacair2023-knnvc-demo/.\n","authors":["Matthew Baas","Herman Kamper"],"pdf_url":"https://arxiv.org/pdf/2310.08104v1.pdf","comment":"11 pages, 1 figure, 5 tables. Accepted at SACAIR 2023"},{"id":"http://arxiv.org/abs/2310.01917v2","updated":"2023-10-12T07:59:56Z","published":"2023-10-03T09:46:02Z","title":"Hierarchical Evaluation Framework: Best Practices for Human Evaluation","summary":" Human evaluation plays a crucial role in Natural Language Processing (NLP) as\nit assesses the quality and relevance of developed systems, thereby\nfacilitating their enhancement. However, the absence of widely accepted human\nevaluation metrics in NLP hampers fair comparisons among different systems and\nthe establishment of universal assessment standards. Through an extensive\nanalysis of existing literature on human evaluation metrics, we identified\nseveral gaps in NLP evaluation methodologies. These gaps served as motivation\nfor developing our own hierarchical evaluation framework. The proposed\nframework offers notable advantages, particularly in providing a more\ncomprehensive representation of the NLP system's performance. We applied this\nframework to evaluate the developed Machine Reading Comprehension system, which\nwas utilized within a human-AI symbiosis model. The results highlighted the\nassociations between the quality of inputs and outputs, underscoring the\nnecessity to evaluate both components rather than solely focusing on outputs.\nIn future work, we will investigate the potential time-saving benefits of our\nproposed framework for evaluators assessing NLP systems.\n","authors":["Iva Bojic","Jessica Chen","Si Yuan Chang","Qi Chwen Ong","Shafiq Joty","Josip Car"],"pdf_url":"https://arxiv.org/pdf/2310.01917v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07282v2","updated":"2023-10-12T07:53:53Z","published":"2023-10-11T08:16:35Z","title":"An Analysis on Large Language Models in Healthcare: A Case Study of\n BioBERT","summary":" This paper conducts a comprehensive investigation into applying large\nlanguage models, particularly on BioBERT, in healthcare. It begins with\nthoroughly examining previous natural language processing (NLP) approaches in\nhealthcare, shedding light on the limitations and challenges these methods\nface. 
Following that, this research explores the path that led to the\nincorporation of BioBERT into healthcare applications, highlighting its\nsuitability for addressing the specific requirements of tasks related to\nbiomedical text mining. The analysis outlines a systematic methodology for\nfine-tuning BioBERT to meet the unique needs of the healthcare domain. This\napproach includes various components, including the gathering of data from a\nwide range of healthcare sources, data annotation for tasks like identifying\nmedical entities and categorizing them, and the application of specialized\npreprocessing techniques tailored to handle the complexities found in\nbiomedical texts. Additionally, the paper covers aspects related to model\nevaluation, with a focus on healthcare benchmarks and tasks such as biomedical\nnatural language processing, question answering, clinical document\nclassification, and medical entity recognition. It explores techniques to\nimprove the model's interpretability and validates its performance compared to\nexisting healthcare-focused language models. The paper thoroughly examines\nethical considerations, particularly patient privacy and data security. It\nhighlights the benefits of incorporating BioBERT into healthcare contexts,\nincluding enhanced clinical decision support and more efficient information\nretrieval. Nevertheless, it acknowledges the impediments and complexities of\nthis integration, encompassing concerns regarding data privacy, transparency,\nresource-intensive requirements, and the necessity for model customization to\nalign with diverse healthcare domains.\n","authors":["Shyni Sharaf","V. S. Anoop"],"pdf_url":"https://arxiv.org/pdf/2310.07282v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08102v1","updated":"2023-10-12T07:52:19Z","published":"2023-10-12T07:52:19Z","title":"QASiNa: Religious Domain Question Answering using Sirah Nabawiyah","summary":" Nowadays, Question Answering (QA) tasks receive significant research focus,\nparticularly with the development of Large Language Models (LLMs) such as Chat\nGPT [1]. LLMs can be applied to various domains, but they contradict the\nprinciples of information transmission when applied to the Islamic domain. In\nIslam, the sources of information and who may give interpretations or tafseer\nof those sources are strictly regulated [2]. The approach used by LLMs to\ngenerate answers based on their own interpretation is similar to the concept of\ntafseer, yet an LLM is neither an Islamic expert nor a human, which is not\npermitted in Islam. Indonesia is the country with the largest Islamic believer\npopulation in the world [3]. Given the high influence of LLMs, we need to\nevaluate LLMs in the religious domain. Currently, only a few religious QA\ndatasets are available, and none of them use Sirah Nabawiyah, especially in the\nIndonesian language. In this paper, we propose the Question Answering Sirah Nabawiyah\n(QASiNa) dataset, a novel dataset compiled from Sirah Nabawiyah literature in\nthe Indonesian language. We demonstrate our dataset by using mBERT [4], XLM-R [5],\nand IndoBERT [6], which are fine-tuned with the Indonesian translation of SQuAD v2.0\n[7]. The XLM-R model returned the best performance on QASiNa with an EM of 61.20,\nan F1-Score of 75.94, and a Substring Match of 70.00. We compare the XLM-R performance\nwith Chat GPT-3.5 and GPT-4 [1]. Both Chat GPT versions returned lower EM and\nF1-Score with higher Substring Match, and the gap between EM and Substring Match gets\nwider in GPT-4. 
The experiments indicate that Chat GPT tends to give excessive\ninterpretations, as evidenced by its higher Substring Match scores compared to\nEM and F1-Score, even after providing instruction and context. We conclude that\nChat GPT is unsuitable for the question answering task in the religious domain,\nespecially for the Islamic religion.\n","authors":["Muhammad Razif Rizqullah","Ayu Purwarianti","Alham Fikri Aji"],"pdf_url":"https://arxiv.org/pdf/2310.08102v1.pdf","comment":"6 Pages. In Proceeding of 10th International Conference on Advanced\n Informatics: Concepts, Theory and Applications (ICAICTA 2023)"},{"id":"http://arxiv.org/abs/2310.08101v1","updated":"2023-10-12T07:51:43Z","published":"2023-10-12T07:51:43Z","title":"Promptor: A Conversational and Autonomous Prompt Generation Agent for\n Intelligent Text Entry Techniques","summary":" Text entry is an essential task in our day-to-day digital interactions.\nNumerous intelligent features have been developed to streamline this process,\nmaking text entry more effective, efficient, and fluid. These improvements\ninclude sentence prediction and user personalization. However, as deep\nlearning-based language models become the norm for these advanced features, the\nnecessity for data collection and model fine-tuning increases. These challenges\ncan be mitigated by harnessing the in-context learning capability of large\nlanguage models such as GPT-3.5. This unique feature allows the language model\nto acquire new skills through prompts, eliminating the need for data collection\nand fine-tuning. Consequently, large language models can learn various text\nprediction techniques. We initially showed that, for a sentence prediction\ntask, merely prompting GPT-3.5 surpassed a GPT-2 backed system and is\ncomparable with a fine-tuned GPT-3.5 model, with the latter two methods\nrequiring costly data collection, fine-tuning and post-processing. However, the\ntask of prompting large language models to specialize in specific text\nprediction tasks can be challenging, particularly for designers without\nexpertise in prompt engineering. To address this, we introduce Promptor, a\nconversational prompt generation agent designed to engage proactively with\ndesigners. Promptor can automatically generate complex prompts tailored to meet\nspecific needs, thus offering a solution to this challenge. We conducted a user\nstudy involving 24 participants creating prompts for three intelligent text\nentry tasks; half of the participants used Promptor while the other half\ndesigned prompts themselves. The results show that Promptor-designed prompts\nresult in a 35% increase in similarity and a 22% increase in coherence over those by\ndesigners.\n","authors":["Junxiao Shen","John J. Dudley","Jingyao Zheng","Bill Byrne","Per Ola Kristensson"],"pdf_url":"https://arxiv.org/pdf/2310.08101v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08099v1","updated":"2023-10-12T07:48:50Z","published":"2023-10-12T07:48:50Z","title":"ClimateNLP: Analyzing Public Sentiment Towards Climate Change Using\n Natural Language Processing","summary":" Climate change's impact on human health poses unprecedented and diverse\nchallenges. Unless proactive measures based on solid evidence are implemented,\nthese threats will likely escalate and continue to endanger human well-being.\nThe escalating advancements in information and communication technologies have\nfacilitated the widespread availability and utilization of social media\nplatforms. 
Individuals utilize platforms such as Twitter and Facebook to\nexpress their opinions, thoughts, and critiques on diverse subjects,\nencompassing the pressing issue of climate change. The proliferation of climate\nchange-related content on social media necessitates comprehensive analysis to\nglean meaningful insights. This paper employs natural language processing (NLP)\ntechniques to analyze climate change discourse and quantify the sentiment of\nclimate change-related tweets. We use ClimateBERT, a pretrained model\nfine-tuned specifically for the climate change domain. The objective is to\ndiscern the sentiment individuals express and uncover patterns in public\nopinion concerning climate change. Analyzing tweet sentiments allows a deeper\ncomprehension of public perceptions, concerns, and emotions about this critical\nglobal challenge. The findings from this experiment unearth valuable insights\ninto public sentiment and the entities associated with climate change\ndiscourse. Policymakers, researchers, and organizations can leverage such\nanalyses to understand public perceptions, identify influential actors, and\ndevise informed strategies to address climate change challenges.\n","authors":["Ajay Krishnan T. K.","V. S. Anoop"],"pdf_url":"https://arxiv.org/pdf/2310.08099v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08085v1","updated":"2023-10-12T07:17:17Z","published":"2023-10-12T07:17:17Z","title":"Low-Resource Clickbait Spoiling for Indonesian via Question Answering","summary":" Clickbait spoiling aims to generate a short text to satisfy the curiosity\ninduced by a clickbait post. As it is a newly introduced task, the dataset is\nonly available in English so far. Our contributions include the construction of\na manually labeled clickbait spoiling corpus in Indonesian and an evaluation on\nusing cross-lingual zero-shot question answering-based models to tackle\nclickbait spoiling for a low-resource language like Indonesian. We utilize a\nselection of multilingual language models. The experimental results suggest\nthat the XLM-RoBERTa (large) model outperforms other models for phrase and passage\nspoilers, while the mDeBERTa (base) model outperforms other models for\nmultipart spoilers.\n","authors":["Ni Putu Intan Maharani","Ayu Purwarianti","Alham Fikri Aji"],"pdf_url":"https://arxiv.org/pdf/2310.08085v1.pdf","comment":"Accepted in ICAICTA 2023 (10th International Conference on Advanced\n Informatics: Concepts, Theory and Applications)"},{"id":"http://arxiv.org/abs/2310.08078v1","updated":"2023-10-12T06:59:10Z","published":"2023-10-12T06:59:10Z","title":"To token or not to token: A Comparative Study of Text Representations\n for Cross-Lingual Transfer","summary":" Choosing an appropriate tokenization scheme is often a bottleneck in\nlow-resource cross-lingual transfer. To understand the downstream implications\nof text representation choices, we perform a comparative analysis on language\nmodels having diverse text representation modalities including 2\nsegmentation-based models (\\texttt{BERT}, \\texttt{mBERT}), 1 image-based model\n(\\texttt{PIXEL}), and 1 character-level model (\\texttt{CANINE}). First, we\npropose a scoring Language Quotient (LQ) metric capable of providing a weighted\nrepresentation of both zero-shot and few-shot evaluation combined. Utilizing\nthis metric, we perform experiments comprising 19 source languages and 133\ntarget languages on three tasks (POS tagging, Dependency parsing, and NER). 
Our\nanalysis reveals that image-based models excel in cross-lingual transfer when\nlanguages are closely related and share visually similar scripts. However, for\ntasks biased toward word meaning (POS, NER), segmentation-based models prove to\nbe superior. Furthermore, in dependency parsing tasks where word relationships\nplay a crucial role, character-level models outperform\nothers. Finally, we propose a recommendation scheme based on our findings to\nguide model selection according to task and language requirements.\n","authors":["Md Mushfiqur Rahman","Fardin Ahsan Sakib","Fahim Faisal","Antonios Anastasopoulos"],"pdf_url":"https://arxiv.org/pdf/2310.08078v1.pdf","comment":"Accepted at 3RD MULTILINGUAL REPRESENTATION LEARNING (MRL) WORKSHOP,\n 2023"},{"id":"http://arxiv.org/abs/2310.08072v1","updated":"2023-10-12T06:46:07Z","published":"2023-10-12T06:46:07Z","title":"Training Generative Question-Answering on Synthetic Data Obtained from\n an Instruct-tuned Model","summary":" This paper presents a simple and cost-effective method for synthesizing data\nto train question-answering systems. For training, fine-tuning GPT models is a\ncommon practice in resource-rich languages like English; however, it becomes\nchallenging for non-English languages due to the scarcity of sufficient\nquestion-answer (QA) pairs. Existing approaches use question and answer\ngenerators trained on human-authored QA pairs, which involves substantial human\nexpenses. In contrast, we use an instruct-tuned model to generate QA pairs in a\nzero-shot or few-shot manner. We conduct experiments to compare various\nstrategies for obtaining QA pairs from the instruct-tuned model. The results\ndemonstrate that a model trained on our proposed synthetic data achieves\ncomparable performance to a model trained on manually curated datasets, without\nincurring human costs.\n","authors":["Kosuke Takahashi","Takahiro Omi","Kosuke Arima","Tatsuya Ishigaki"],"pdf_url":"https://arxiv.org/pdf/2310.08072v1.pdf","comment":"PACLIC 2023 short paper, 4 pages (6 pages including references), 4\n figures"},{"id":"http://arxiv.org/abs/2305.07375v4","updated":"2023-10-12T06:42:25Z","published":"2023-05-12T10:54:13Z","title":"Is ChatGPT a Good Causal Reasoner? A Comprehensive Evaluation","summary":" Causal reasoning ability is crucial for numerous NLP applications. Despite\nthe impressive emerging ability of ChatGPT in various NLP tasks, it is unclear\nhow well ChatGPT performs in causal reasoning. In this paper, we conduct the\nfirst comprehensive evaluation of ChatGPT's causal reasoning capabilities.\nExperiments show that ChatGPT is not a good causal reasoner, but a good causal\nexplainer. Besides, ChatGPT suffers from serious hallucination on causal reasoning,\npossibly due to the reporting biases between causal and non-causal\nrelationships in natural language, as well as ChatGPT's upgrading processes,\nsuch as RLHF. The In-Context Learning (ICL) and Chain-of-Thought (CoT)\ntechniques can further exacerbate such causal hallucination. Additionally, the\ncausal reasoning ability of ChatGPT is sensitive to the words used to express\nthe causal concept in prompts, and close-ended prompts perform better than\nopen-ended prompts. 
For events in sentences, ChatGPT excels at capturing\nexplicit causality rather than implicit causality, and performs better in\nsentences with lower event density and smaller lexical distance between events.\nThe code is available on https://github.com/ArrogantL/ChatGPT4CausalReasoning .\n","authors":["Jinglong Gao","Xiao Ding","Bing Qin","Ting Liu"],"pdf_url":"https://arxiv.org/pdf/2305.07375v4.pdf","comment":"Accepted to Findings of EMNLP 2023"},{"id":"http://arxiv.org/abs/2310.08069v1","updated":"2023-10-12T06:32:42Z","published":"2023-10-12T06:32:42Z","title":"Rethinking Negative Pairs in Code Search","summary":" Recently, contrastive learning has become a key component in fine-tuning code\nsearch models for software development efficiency and effectiveness. It pulls\ntogether positive code snippets while pushing negative samples away given\nsearch queries. Among contrastive learning, InfoNCE is the most widely used\nloss function due to its better performance. However, the following problems in\nnegative samples of InfoNCE may deteriorate its representation learning: 1) The\nexistence of false negative samples in large code corpora due to duplications.\n2). The failure to explicitly differentiate between the potential relevance of\nnegative samples. As an example, a bubble sorting algorithm example is less\n``negative'' than a file saving function for the quick sorting algorithm query.\nIn this paper, we tackle the above problems by proposing a simple yet effective\nSoft-InfoNCE loss that inserts weight terms into InfoNCE. In our proposed loss\nfunction, we apply three methods to estimate the weights of negative pairs and\nshow that the vanilla InfoNCE loss is a special case of Soft-InfoNCE.\nTheoretically, we analyze the effects of Soft-InfoNCE on controlling the\ndistribution of learnt code representations and on deducing a more precise\nmutual information estimation. We furthermore discuss the superiority of\nproposed loss functions with other design alternatives. Extensive experiments\ndemonstrate the effectiveness of Soft-InfoNCE and weights estimation methods\nunder state-of-the-art code search models on a large-scale public dataset\nconsisting of six programming languages. Source code is available at\n\\url{https://github.com/Alex-HaochenLi/Soft-InfoNCE}.\n","authors":["Haochen Li","Xin Zhou","Luu Anh Tuan","Chunyan Miao"],"pdf_url":"https://arxiv.org/pdf/2310.08069v1.pdf","comment":"Accepted to EMNLP 2023"},{"id":"http://arxiv.org/abs/2305.14069v2","updated":"2023-10-12T06:20:42Z","published":"2023-05-23T13:48:32Z","title":"Evaluating Factual Consistency of Summaries with Large Language Models","summary":" Detecting factual errors in summaries has been an important and challenging\nsubject in summarization research. Inspired by the emergent ability of large\nlanguage models (LLMs), we explore evaluating factual consistency of summaries\nby directly prompting LLMs. We present a comprehensive empirical study to\nassess the ability of LLMs as factual consistency evaluators, which consists of\n(1) analyzing different LLMs such as the GPT model series and Flan-T5; (2)\ninvestigating a variety of prompting methods including vanilla prompting,\nchain-of-thought prompting, and a sentence-by-sentence prompting method to\ntackle long summaries; and (3) evaluating on diverse summaries generated by\nmultiple summarization systems, ranging from pre-transformer methods to SOTA\npretrained models. 
Our experiments demonstrate that prompting LLMs is able to\noutperform the previous best factuality systems in all settings, by up to 12.2\nabsolute points in terms of the binary classification accuracy on inconsistency\ndetection.\n","authors":["Shiqi Chen","Siyang Gao","Junxian He"],"pdf_url":"https://arxiv.org/pdf/2305.14069v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08041v1","updated":"2023-10-12T05:25:49Z","published":"2023-10-12T05:25:49Z","title":"QLLM: Accurate and Efficient Low-Bitwidth Quantization for Large\n Language Models","summary":" Large Language Models (LLMs) excel in NLP, but their demands hinder their\nwidespread deployment. While Quantization-Aware Training (QAT) offers a\nsolution, its extensive training costs make Post-Training Quantization (PTQ) a\nmore practical approach for LLMs. In existing studies, activation outliers in\nparticular channels are identified as the bottleneck to PTQ accuracy. They\npropose to transform the magnitudes from activations to weights, which however\noffers limited alleviation or suffers from unstable gradients, resulting in a\nsevere performance drop at low-bitwidth. In this paper, we propose QLLM, an\naccurate and efficient low-bitwidth PTQ method designed for LLMs. QLLM\nintroduces an adaptive channel reassembly technique that reallocates the\nmagnitude of outliers to other channels, thereby mitigating their impact on the\nquantization range. This is achieved by channel disassembly and channel\nassembly, which first breaks down the outlier channels into several\nsub-channels to ensure a more balanced distribution of activation magnitudes.\nThen similar channels are merged to maintain the original channel number for\nefficiency. Additionally, an adaptive strategy is designed to autonomously\ndetermine the optimal number of sub-channels for channel disassembly. To\nfurther compensate for the performance loss caused by quantization, we propose\nan efficient tuning method that only learns a small number of low-rank weights\nwhile freezing the pre-trained quantized model. After training, these low-rank\nparameters can be fused into the frozen weights without affecting inference.\nExtensive experiments on LLaMA-1 and LLaMA-2 show that QLLM can obtain accurate\nquantized models efficiently. For example, QLLM quantizes the 4-bit LLaMA-2-70B\nwithin 10 hours on a single A100-80G GPU, outperforming the previous\nstate-of-the-art method by 7.89% on the average accuracy across five zero-shot\ntasks.\n","authors":["Jing Liu","Ruihao Gong","Xiuying Wei","Zhiwei Dong","Jianfei Cai","Bohan Zhuang"],"pdf_url":"https://arxiv.org/pdf/2310.08041v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08027v1","updated":"2023-10-12T04:14:28Z","published":"2023-10-12T04:14:28Z","title":"Exploring Large Language Models for Multi-Modal Out-of-Distribution\n Detection","summary":" Out-of-distribution (OOD) detection is essential for reliable and trustworthy\nmachine learning. Recent multi-modal OOD detection leverages textual\ninformation from in-distribution (ID) class names for visual OOD detection, yet\nit currently neglects the rich contextual information of ID classes. Large\nlanguage models (LLMs) encode a wealth of world knowledge and can be prompted\nto generate descriptive features for each class. Indiscriminately using such\nknowledge causes catastrophic damage to OOD detection due to LLMs'\nhallucinations, as is observed by our analysis. 
In this paper, we propose to\napply world knowledge to enhance OOD detection performance through selective\ngeneration from LLMs. Specifically, we introduce a consistency-based\nuncertainty calibration method to estimate the confidence score of each\ngeneration. We further extract visual objects from each image to fully\ncapitalize on the aforementioned world knowledge. Extensive experiments\ndemonstrate that our method consistently outperforms the state-of-the-art.\n","authors":["Yi Dai","Hao Lang","Kaisheng Zeng","Fei Huang","Yongbin Li"],"pdf_url":"https://arxiv.org/pdf/2310.08027v1.pdf","comment":"EMNLP2023 Findings Long Paper"},{"id":"http://arxiv.org/abs/2309.10691v2","updated":"2023-10-12T04:07:56Z","published":"2023-09-19T15:25:42Z","title":"MINT: Evaluating LLMs in Multi-turn Interaction with Tools and Language\n Feedback","summary":" To solve complex tasks, large language models (LLMs) often require multiple\nrounds of interactions with the user, sometimes assisted by external tools.\nHowever, current evaluation protocols often emphasize benchmark performance\nwith single-turn exchanges, neglecting the nuanced interactions among the user,\nLLMs, and external tools, while also underestimating the importance of natural\nlanguage feedback from users. These oversights contribute to discrepancies\nbetween research benchmark evaluations and real-world use cases. We introduce\nMINT, a benchmark that evaluates LLMs' ability to solve tasks with multi-turn\ninteractions by (1) using tools and (2) leveraging natural language feedback.\nTo ensure reproducibility, we provide an evaluation framework where LLMs can\naccess tools by executing Python code and receive users' natural language\nfeedback simulated by GPT-4. We repurpose a diverse set of established\nevaluation datasets focusing on reasoning, coding, and decision-making and\ncarefully curate them into a compact subset for efficient evaluation. Our\nanalysis of 20 open- and closed-source LLMs offers intriguing findings. (a)\nLLMs generally benefit from tools and language feedback, with performance gains\n(absolute, same below) of 1-8% for each turn of tool use and 2-17% with natural\nlanguage feedback. (b) Better single-turn performance does not guarantee better\nmulti-turn performance. (c) Surprisingly, on the LLMs evaluated, supervised\ninstruction-finetuning (SIFT) and reinforcement learning from human feedback\n(RLHF) generally hurt multi-turn capabilities. We expect MINT can help measure\nprogress and incentivize research in improving LLMs' capabilities in multi-turn\ninteractions, especially for open-source communities where multi-turn human\nevaluation can be less accessible compared to commercial LLMs with a larger\nuser base.\n","authors":["Xingyao Wang","Zihan Wang","Jiateng Liu","Yangyi Chen","Lifan Yuan","Hao Peng","Heng Ji"],"pdf_url":"https://arxiv.org/pdf/2309.10691v2.pdf","comment":"Code is available on our project website:\n https://xingyaoww.github.io/mint-bench"},{"id":"http://arxiv.org/abs/2310.08017v1","updated":"2023-10-12T03:33:06Z","published":"2023-10-12T03:33:06Z","title":"Harnessing Large Language Models' Empathetic Response Generation\n Capabilities for Online Mental Health Counselling Support","summary":" Large Language Models (LLMs) have demonstrated remarkable performance across\nvarious information-seeking and reasoning tasks. These computational systems\ndrive state-of-the-art dialogue systems, such as ChatGPT and Bard. 
They also\ncarry substantial promise in meeting the growing demands of mental health care,\nalbeit relatively unexplored. As such, this study sought to examine LLMs'\ncapability to generate empathetic responses in conversations that emulate those\nin a mental health counselling setting. We selected five LLMs: version 3.5 and\nversion 4 of the Generative Pre-training (GPT), Vicuna FastChat-T5, Pathways\nLanguage Model (PaLM) version 2, and Falcon-7B-Instruct. Based on a simple\ninstructional prompt, these models responded to utterances derived from the\nEmpatheticDialogues (ED) dataset. Using three empathy-related metrics, we\ncompared their responses to those from traditional response generation dialogue\nsystems, which were fine-tuned on the ED dataset, along with human-generated\nresponses. Notably, we discovered that responses from the LLMs were remarkably\nmore empathetic in most scenarios. We position our findings in light of\ncatapulting advancements in creating empathetic conversational systems.\n","authors":["Siyuan Brandon Loh","Aravind Sesagiri Raamkumar"],"pdf_url":"https://arxiv.org/pdf/2310.08017v1.pdf","comment":"7 pages, 1 figure"},{"id":"http://arxiv.org/abs/2310.07644v2","updated":"2023-10-12T03:32:32Z","published":"2023-10-11T16:40:57Z","title":"Rethinking the BERT-like Pretraining for DNA Sequences","summary":" With the success of large-scale pretraining in NLP, there is an increasing\ntrend of applying it to the domain of life sciences. In particular, pretraining\nmethods based on DNA sequences have garnered growing attention due to their\npotential to capture generic information about genes. However, existing\npretraining methods for DNA sequences largely rely on direct adoptions of BERT\npretraining from NLP, lacking a comprehensive understanding and a specifically\ntailored approach. To address this research gap, we first conducted a series of\nexploratory experiments and gained several insightful observations: 1) In the\nfine-tuning phase of downstream tasks, when using K-mer overlapping\ntokenization instead of K-mer non-overlapping tokenization, both overlapping\nand non-overlapping pretraining weights show consistent performance\nimprovement.2) During the pre-training process, using K-mer overlapping\ntokenization quickly produces clear K-mer embeddings and reduces the loss to a\nvery low level, while using K-mer non-overlapping tokenization results in less\ndistinct embeddings and continuously decreases the loss. 3) Using overlapping\ntokenization causes the self-attention in the intermediate layers of\npre-trained models to tend to overly focus on certain tokens, reflecting that\nthese layers are not adequately optimized. In summary, overlapping tokenization\ncan benefit the fine-tuning of downstream tasks but leads to inadequate\npretraining with fast convergence. To unleash the pretraining potential, we\nintroduce a novel approach called RandomMask, which gradually increases the\ntask difficulty of BERT-like pretraining by continuously expanding its mask\nboundary, forcing the model to learn more knowledge. 
RandomMask is simple but\neffective, achieving top-tier performance on 26 of 28 datasets\nspanning 7 downstream tasks.\n","authors":["Chaoqi Liang","Weiqiang Bai","Lifeng Qiao","Yuchen Ren","Jianle Sun","Peng Ye","Hongliang Yan","Xinzhu Ma","Wangmeng Zuo","Wanli Ouyang"],"pdf_url":"https://arxiv.org/pdf/2310.07644v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.06488v2","updated":"2023-10-12T03:23:40Z","published":"2023-10-10T09:57:17Z","title":"SpikeCLIP: A Contrastive Language-Image Pretrained Spiking Neural\n Network","summary":" Spiking neural networks (SNNs) have demonstrated the capability to achieve\ncomparable performance to deep neural networks (DNNs) in both visual and\nlinguistic domains while offering the advantages of improved energy efficiency\nand adherence to biological plausibility. However, the extension of such\nsingle-modality SNNs into the realm of multimodal scenarios remains an\nunexplored territory. Drawing inspiration from the concept of contrastive\nlanguage-image pre-training (CLIP), we introduce a novel framework, named\nSpikeCLIP, to address the gap between two modalities within the context of\nspike-based computing through a two-step recipe involving ``Alignment\nPre-training + Dual-Loss Fine-tuning\". Extensive experiments demonstrate that\nSNNs achieve comparable results to their DNN counterparts while significantly\nreducing energy consumption across a variety of datasets commonly used for\nmultimodal model evaluation. Furthermore, SpikeCLIP maintains robust\nperformance in image classification tasks that involve class labels not\npredefined within specific categories.\n","authors":["Tianlong Li","Wenhao Liu","Changze Lv","Jianhan Xu","Cenyuan Zhang","Muling Wu","Xiaoqing Zheng","Xuanjing Huang"],"pdf_url":"https://arxiv.org/pdf/2310.06488v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02285v2","updated":"2023-10-12T03:05:36Z","published":"2023-09-05T14:45:27Z","title":"PromptTTS 2: Describing and Generating Voices with Text Prompt","summary":" Speech conveys more information than text, as the same word can be uttered in\nvarious voices to convey diverse information. Compared to traditional\ntext-to-speech (TTS) methods relying on speech prompts (reference speech) for\nvoice variability, using text prompts (descriptions) is more user-friendly\nsince speech prompts can be hard to find or may not exist at all. TTS\napproaches based on the text prompt face two main challenges: 1) the\none-to-many problem, where not all details about voice variability can be\ndescribed in the text prompt, and 2) the limited availability of text prompt\ndatasets, where vendors and a large cost of data labeling are required to write\ntext prompts for speech. In this work, we introduce PromptTTS 2 to address\nthese challenges with a variation network to provide variability information of\nvoice not captured by text prompts, and a prompt generation pipeline to utilize\nlarge language models (LLMs) to compose high-quality text prompts.\nSpecifically, the variation network predicts the representation extracted from\nthe reference speech (which contains full information about voice variability)\nbased on the text prompt representation. 
For the prompt generation pipeline, it\ngenerates text prompts for speech with a speech language understanding model to\nrecognize voice attributes (e.g., gender, speed) from speech and a large\nlanguage model to formulate text prompts based on the recognition results.\nExperiments on a large-scale (44K hours) speech dataset demonstrate that\ncompared to the previous works, PromptTTS 2 generates voices more consistent\nwith text prompts and supports the sampling of diverse voice variability,\nthereby offering users more choices on voice generation. Additionally, the\nprompt generation pipeline produces high-quality text prompts, eliminating the\nlarge labeling cost. The demo page of PromptTTS 2 is available online.\n","authors":["Yichong Leng","Zhifang Guo","Kai Shen","Xu Tan","Zeqian Ju","Yanqing Liu","Yufei Liu","Dongchao Yang","Leying Zhang","Kaitao Song","Lei He","Xiang-Yang Li","Sheng Zhao","Tao Qin","Jiang Bian"],"pdf_url":"https://arxiv.org/pdf/2309.02285v2.pdf","comment":"Demo page: https://speechresearch.github.io/prompttts2"},{"id":"http://arxiv.org/abs/2310.04948v2","updated":"2023-10-12T02:43:13Z","published":"2023-10-08T00:02:25Z","title":"TEMPO: Prompt-based Generative Pre-trained Transformer for Time Series\n Forecasting","summary":" The past decade has witnessed significant advances in time series modeling\nwith deep learning. While achieving state-of-the-art results, the\nbest-performing architectures vary highly across applications and domains.\nMeanwhile, for natural language processing, the Generative Pre-trained\nTransformer (GPT) has demonstrated impressive performance via training one\ngeneral-purpose model across various textual datasets. It is intriguing to\nexplore whether GPT-type architectures can be effective for time series,\ncapturing the intrinsic dynamic attributes and leading to significant accuracy\nimprovements. In this paper, we propose a novel framework, TEMPO, that can\neffectively learn time series representations. We focus on utilizing two\nessential inductive biases of the time series task for pre-trained models: (i)\ndecomposition of the complex interaction between trend, seasonal and residual\ncomponents; and (ii) introducing the selection-based prompts to facilitate\ndistribution adaptation in non-stationary time series. TEMPO expands the\ncapability for dynamically modeling real-world temporal phenomena from data\nwithin diverse domains. Our experiments demonstrate the superior performance of\nTEMPO over state-of-the-art methods on a number of time series benchmark\ndatasets. This performance gain is observed not only in standard supervised\nlearning settings but also in scenarios involving previously unseen datasets as\nwell as in scenarios with multi-modal inputs. This compelling finding\nhighlights TEMPO's potential to constitute a foundational model-building\nframework.\n","authors":["Defu Cao","Furong Jia","Sercan O Arik","Tomas Pfister","Yixiang Zheng","Wen Ye","Yan Liu"],"pdf_url":"https://arxiv.org/pdf/2310.04948v2.pdf","comment":"35 pages, 20 figures, 17 tables"},{"id":"http://arxiv.org/abs/2302.12461v2","updated":"2023-10-12T02:29:09Z","published":"2023-02-24T05:26:08Z","title":"Analyzing And Editing Inner Mechanisms Of Backdoored Language Models","summary":" Poisoning of data sets is a potential security threat to large language\nmodels that can lead to backdoored models. 
A description of the internal\nmechanisms of backdoored language models and how they process trigger inputs,\ne.g., when switching to toxic language, has yet to be found. In this work, we\nstudy the internal representations of transformer-based backdoored language\nmodels and determine early-layer MLP modules as most important for the backdoor\nmechanism in combination with the initial embedding projection. We use this\nknowledge to remove, insert, and modify backdoor mechanisms with engineered\nreplacements that reduce the MLP module outputs to essentials for the backdoor\nmechanism. To this end, we introduce PCP ablation, where we replace transformer\nmodules with low-rank matrices based on the principal components of their\nactivations. We demonstrate our results on backdoored toy, backdoored large,\nand non-backdoored open-source models. We show that we can improve the backdoor\nrobustness of large language models by locally constraining individual modules\nduring fine-tuning on potentially poisonous data sets.\n Trigger warning: Offensive language.\n","authors":["Max Lamparth","Anka Reuel"],"pdf_url":"https://arxiv.org/pdf/2302.12461v2.pdf","comment":"included new experimental results and addressed reviewer feedback"},{"id":"http://arxiv.org/abs/2310.07284v2","updated":"2023-10-12T01:40:37Z","published":"2023-10-11T08:17:54Z","title":"Typing to Listen at the Cocktail Party: Text-Guided Target Speaker\n Extraction","summary":" Humans possess an extraordinary ability to selectively focus on the sound\nsource of interest amidst complex acoustic environments, commonly referred to\nas cocktail party scenarios. In an attempt to replicate this remarkable\nauditory attention capability in machines, target speaker extraction (TSE)\nmodels have been developed. These models leverage the pre-registered cues of\nthe target speaker to extract the sound source of interest. However, the\neffectiveness of these models is hindered in real-world scenarios due to the\nunreliable or even absence of pre-registered cues. To address this limitation,\nthis study investigates the integration of natural language description to\nenhance the feasibility, controllability, and performance of existing TSE\nmodels. Specifically, we propose a model named LLM-TSE, wherein a large\nlanguage model (LLM) to extract useful semantic cues from the user's typed text\ninput. These cues can serve as independent extraction cues, task selectors to\ncontrol the TSE process, or complement the pre-registered cues. Our\nexperimental results demonstrate competitive performance when only text-based\ncues are presented, the effectiveness of using input text as a task selector,\nand a new state-of-the-art when combining text-based cues with pre-registered\ncues. To our knowledge, this is the first study to successfully incorporate\nLLMs to guide target speaker extraction, which can be a cornerstone for\ncocktail party problem research.\n","authors":["Xiang Hao","Jibin Wu","Jianwei Yu","Chenglin Xu","Kay Chen Tan"],"pdf_url":"https://arxiv.org/pdf/2310.07284v2.pdf","comment":"Under review, https://github.com/haoxiangsnr/llm-tse"},{"id":"http://arxiv.org/abs/2310.07968v1","updated":"2023-10-12T01:17:56Z","published":"2023-10-12T01:17:56Z","title":"Think, Act, and Ask: Open-World Interactive Personalized Robot\n Navigation","summary":" Zero-Shot Object Navigation (ZSON) enables agents to navigate towards\nopen-vocabulary objects in unknown environments. 
The existing works of ZSON\nmainly focus on following individual instructions to find generic object\nclasses, neglecting the utilization of natural language interaction and the\ncomplexities of identifying user-specific objects. To address these\nlimitations, we introduce Zero-shot Interactive Personalized Object Navigation\n(ZIPON), where robots need to navigate to personalized goal objects while\nengaging in conversations with users. To solve ZIPON, we propose a new\nframework termed Open-woRld Interactive persOnalized Navigation (ORION), which\nuses Large Language Models (LLMs) to make sequential decisions to manipulate\ndifferent modules for perception, navigation and communication. Experimental\nresults show that the performance of interactive agents that can leverage user\nfeedback exhibits significant improvement. However, obtaining a good balance\nbetween task completion and the efficiency of navigation and interaction\nremains challenging for all methods. We further provide more findings on the\nimpact of diverse user feedback forms on the agents' performance.\n","authors":["Yinpei Dai","Run Peng","Sikai Li","Joyce Chai"],"pdf_url":"https://arxiv.org/pdf/2310.07968v1.pdf","comment":"Video available at https://www.youtube.com/watch?v=QW6rMHVpxUY"},{"id":"http://arxiv.org/abs/2310.07659v2","updated":"2023-10-12T01:08:39Z","published":"2023-10-11T17:00:29Z","title":"Well Begun is Half Done: Generator-agnostic Knowledge Pre-Selection for\n Knowledge-Grounded Dialogue","summary":" Accurate knowledge selection is critical in knowledge-grounded dialogue\nsystems. Towards a closer look at it, we offer a novel perspective to organize\nexisting literature, i.e., knowledge selection coupled with, after, and before\ngeneration. We focus on the third under-explored category of study, which can\nnot only select knowledge accurately in advance, but has the advantage to\nreduce the learning, adjustment, and interpretation burden of subsequent\nresponse generation models, especially LLMs. We propose GATE, a\ngenerator-agnostic knowledge selection method, to prepare knowledge for\nsubsequent response generation models by selecting context-related knowledge\namong different knowledge structures and variable knowledge requirements.\nExperimental results demonstrate the superiority of GATE, and indicate that\nknowledge selection before generation is a lightweight yet effective way to\nfacilitate LLMs (e.g., ChatGPT) to generate more informative responses.\n","authors":["Lang Qin","Yao Zhang","Hongru Liang","Jun Wang","Zhenglu Yang"],"pdf_url":"https://arxiv.org/pdf/2310.07659v2.pdf","comment":"Accepted by EMNLP2023 main conference"},{"id":"http://arxiv.org/abs/2310.01889v3","updated":"2023-10-12T01:00:09Z","published":"2023-10-03T08:44:50Z","title":"Ring Attention with Blockwise Transformers for Near-Infinite Context","summary":" Transformers have emerged as the architecture of choice for many\nstate-of-the-art AI models, showcasing exceptional performance across a wide\nrange of AI applications. However, the memory demands imposed by Transformers\nlimit their ability to handle long sequences, thereby creating challenges for\ntasks involving extended sequences or long-term dependencies. We present a\ndistinct approach, Ring Attention, which leverages blockwise computation of\nself-attention to distribute long sequences across multiple devices while\noverlapping the communication of key-value blocks with the computation of\nblockwise attention. 
Ring Attention enables training and inference of sequences\nthat are up to device count times longer than those of prior memory-efficient\nTransformers, effectively eliminating the memory constraints imposed by\nindividual devices. Extensive experiments on language modeling tasks\ndemonstrate the effectiveness of Ring Attention in allowing large sequence\ninput size and improving performance.\n","authors":["Hao Liu","Matei Zaharia","Pieter Abbeel"],"pdf_url":"https://arxiv.org/pdf/2310.01889v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07962v1","updated":"2023-10-12T00:57:32Z","published":"2023-10-12T00:57:32Z","title":"Clustering of Spell Variations for Proper Nouns Transliterated from the\n other languages","summary":" One of the prominent problems with processing and operating on text data is\nthe non uniformity of it. Due to the change in the dialects and languages, the\ncaliber of translation is low. This creates a unique problem while using NLP in\ntext data; which is the spell variation arising from the inconsistent\ntranslations and transliterations. This problem can also be further aggravated\nby the human error arising from the various ways to write a Proper Noun from an\nIndian language into its English equivalent. Translating proper nouns\noriginating from Indian languages can be complicated as some proper nouns are\nalso used as common nouns which might be taken literally. Applications of NLP\nthat require addresses, names and other proper nouns face this problem\nfrequently. We propose a method to cluster these spell variations for proper\nnouns using ML techniques and mathematical similarity equations. We aimed to\nuse Affinity Propagation to determine relative similarity between the tokens.\nThe results are augmented by filtering the token-variation pair by a similarity\nthreshold. We were able to reduce the spell variations by a considerable\namount. This application can significantly reduce the amount of human\nannotation efforts needed for data cleansing and formatting.\n","authors":["Prathamesh Pawar"],"pdf_url":"https://arxiv.org/pdf/2310.07962v1.pdf","comment":"3 pages, published Airial Conference 2023"},{"id":"http://arxiv.org/abs/2310.07957v1","updated":"2023-10-12T00:50:24Z","published":"2023-10-12T00:50:24Z","title":"A New Approach Towards Autoformalization","summary":" Verifying mathematical proofs is difficult, but can be automated with the\nassistance of a computer. Autoformalization is the task of automatically\ntranslating natural language mathematics into a formal language that can be\nverified by a program. This is a challenging task, and especially for\nhigher-level mathematics found in research papers. Research paper mathematics\nrequires large amounts of background and context. In this paper, we propose an\navenue towards tackling autoformalization for research-level mathematics, by\nbreaking the task into easier and more approachable subtasks: unlinked\nformalization (formalization with unlinked definitions and theorems), entity\nlinking (linking to the proper theorems and definitions), and finally adjusting\ntypes so it passes the type checker. In addition, we present arXiv2Formal, a\nbenchmark dataset for unlinked formalization consisting of 50 theorems\nformalized for the Lean theorem prover sampled from papers on arXiv.org. 
We\nwelcome any contributions from the community to future versions of this\ndataset.\n","authors":["Nilay Patel","Jeffrey Flanigan","Rahul Saha"],"pdf_url":"https://arxiv.org/pdf/2310.07957v1.pdf","comment":"Under review at MATHAI 2023 @ NeurIPS 2023"},{"id":"http://arxiv.org/abs/2310.08764v1","updated":"2023-10-12T23:17:56Z","published":"2023-10-12T23:17:56Z","title":"Calibrating Likelihoods towards Consistency in Summarization Models","summary":" Despite the recent advances in abstractive text summarization, current\nsummarization models still suffer from generating factually inconsistent\nsummaries, reducing their utility for real-world application. We argue that the\nmain reason for such behavior is that the summarization models trained with\nmaximum likelihood objective assign high probability to plausible sequences\ngiven the context, but they often do not accurately rank sequences by their\nconsistency. In this work, we solve this problem by calibrating the likelihood\nof model generated sequences to better align with a consistency metric measured\nby natural language inference (NLI) models. The human evaluation study and\nautomatic metrics show that the calibrated models generate more consistent and\nhigher-quality summaries. We also show that the models trained using our method\nreturn probabilities that are better aligned with the NLI scores, which\nsignificantly increase reliability of summarization models.\n","authors":["Polina Zablotskaia","Misha Khalman","Rishabh Joshi","Livio Baldini Soares","Shoshana Jakobovits","Joshua Maynez","Shashi Narayan"],"pdf_url":"https://arxiv.org/pdf/2310.08764v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08753v1","updated":"2023-10-12T22:43:38Z","published":"2023-10-12T22:43:38Z","title":"CompA: Addressing the Gap in Compositional Reasoning in Audio-Language\n Models","summary":" A fundamental characteristic of audio is its compositional nature.\nAudio-language models (ALMs) trained using a contrastive approach (e.g., CLAP)\nthat learns a shared representation between audio and language modalities have\nimproved performance in many downstream applications, including zero-shot audio\nclassification, audio retrieval, etc. However, the ability of these models to\neffectively perform compositional reasoning remains largely unexplored and\nnecessitates additional research. In this paper, we propose CompA, a collection\nof two expert-annotated benchmarks with a majority of real-world audio samples,\nto evaluate compositional reasoning in ALMs. Our proposed CompA-order evaluates\nhow well an ALM understands the order or occurrence of acoustic events in\naudio, and CompA-attribute evaluates attribute binding of acoustic events. An\ninstance from either benchmark consists of two audio-caption pairs, where both\naudios have the same acoustic events but with different compositions. An ALM is\nevaluated on how well it matches the right audio to the right caption. Using\nthis benchmark, we first show that current ALMs perform only marginally better\nthan random chance, thereby struggling with compositional reasoning. Next, we\npropose CompA-CLAP, where we fine-tune CLAP using a novel learning method to\nimprove its compositional reasoning abilities. To train CompA-CLAP, we first\npropose improvements to contrastive training with composition-aware hard\nnegatives, allowing for more focused training. 
Next, we propose a novel modular\ncontrastive loss that helps the model learn fine-grained compositional\nunderstanding and overcomes the acute scarcity of openly available\ncompositional audios. CompA-CLAP significantly improves over all our baseline\nmodels on the CompA benchmark, indicating its superior compositional reasoning\ncapabilities.\n","authors":["Sreyan Ghosh","Ashish Seth","Sonal Kumar","Utkarsh Tyagi","Chandra Kiran Evuru","S. Ramaneswaran","S. Sakshi","Oriol Nieto","Ramani Duraiswami","Dinesh Manocha"],"pdf_url":"https://arxiv.org/pdf/2310.08753v1.pdf","comment":"Pre-print under review"},{"id":"http://arxiv.org/abs/2310.03328v2","updated":"2023-10-12T22:14:52Z","published":"2023-10-05T05:55:06Z","title":"Reformulating Domain Adaptation of Large Language Models as\n Adapt-Retrieve-Revise","summary":" While large language models (LLMs) like GPT-4 have recently demonstrated\nastonishing zero-shot capabilities in general domain tasks, they often generate\ncontent with hallucinations in specific domains such as Chinese law, hindering\ntheir application in these areas. This is typically due to the absence of\ntraining data that encompasses such a specific domain, preventing GPT-4 from\nacquiring in-domain knowledge. A pressing challenge is that it's not plausible\nto continue training LLMs of such scale on in-domain data.\n This paper introduces a simple and effective domain adaptation framework for\nGPT-4 by reformulating generation as an \\textbf{adapt-retrieve-revise} process.\nThe initial step is to \\textbf{adapt} an affordable 7B LLM to the target domain\nby continuing learning on in-domain data. When solving a task, we leverage the\nadapted LLM to generate a draft answer given a task query. Then, the draft\nanswer will be used to \\textbf{retrieve} supporting evidence candidates from an\nexternal in-domain knowledge base. Finally, the draft answer and retrieved\nevidence are concatenated into a whole prompt to let GPT-4 assess the evidence\nand \\textbf{revise} the draft answer to generate the final answer.\n Our proposal combines the advantages of the efficiency of adapting a smaller\n7B model with the evidence-assessing capability of GPT-4 and effectively\nprevents GPT-4 from generating hallucinatory content. In the zero-shot setting\nof four Chinese legal tasks, our method improves accuracy by 33.3\\% compared to\nthe direct generation by GPT-4. When compared to two stronger retrieval-based\nbaselines, our method outperforms them by 15.4\\% and 23.9\\%. Our code will be\nreleased\n","authors":["Zhen wan","Yating Zhang","Yexiang Wang","Fei Cheng","Sadao Kurohashi"],"pdf_url":"https://arxiv.org/pdf/2310.03328v2.pdf","comment":"Under submission to ICLR 2024"},{"id":"http://arxiv.org/abs/2310.08744v1","updated":"2023-10-12T22:12:28Z","published":"2023-10-12T22:12:28Z","title":"Circuit Component Reuse Across Tasks in Transformer Language Models","summary":" Recent work in mechanistic interpretability has shown that behaviors in\nlanguage models can be successfully reverse-engineered through circuit\nanalysis. A common criticism, however, is that each circuit is task-specific,\nand thus such analysis cannot contribute to understanding the models at a\nhigher level. In this work, we present evidence that insights (both low-level\nfindings about specific heads and higher-level findings about general\nalgorithms) can indeed generalize across tasks. Specifically, we study the\ncircuit discovered in Wang et al. (2022) for the Indirect Object Identification\n(IOI) task and 1.) 
show that it reproduces on a larger GPT2 model, and 2.) that\nit is mostly reused to solve a seemingly different task: Colored Objects\n(Ippolito & Callison-Burch, 2023). We provide evidence that the process\nunderlying both tasks is functionally very similar, and contains about a 78%\noverlap in in-circuit attention heads. We further present a proof-of-concept\nintervention experiment, in which we adjust four attention heads in middle\nlayers in order to 'repair' the Colored Objects circuit and make it behave like\nthe IOI circuit. In doing so, we boost accuracy from 49.6% to 93.7% on the\nColored Objects task and explain most sources of error. The intervention\naffects downstream attention heads in specific ways predicted by their\ninteractions in the IOI circuit, indicating that this subcircuit behavior is\ninvariant to the different task inputs. Overall, our results provide evidence\nthat it may yet be possible to explain large language models' behavior in terms\nof a relatively small number of interpretable task-general algorithmic building\nblocks and computational components.\n","authors":["Jack Merullo","Carsten Eickhoff","Ellie Pavlick"],"pdf_url":"https://arxiv.org/pdf/2310.08744v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.14600v2","updated":"2023-10-12T21:58:06Z","published":"2023-05-24T00:46:02Z","title":"Learning Semantic Role Labeling from Compatible Label Sequences","summary":" Semantic role labeling (SRL) has multiple disjoint label sets, e.g., VerbNet\nand PropBank. Creating these datasets is challenging, therefore a natural\nquestion is how to use each one to help the other. Prior work has shown that\ncross-task interaction helps, but only explored multitask learning so far. A\ncommon issue with multi-task setup is that argument sequences are still\nseparately decoded, running the risk of generating structurally inconsistent\nlabel sequences (as per lexicons like Semlink). In this paper, we eliminate\nsuch issue with a framework that jointly models VerbNet and PropBank labels as\none sequence. In this setup, we show that enforcing Semlink constraints during\ndecoding constantly improves the overall F1. With special input constructions,\nour joint model infers VerbNet arguments from given PropBank arguments with\nover 99 F1. For learning, we propose a constrained marginal model that learns\nwith knowledge defined in Semlink to further benefit from the large amounts of\nPropBank-only data. On the joint benchmark based on CoNLL05, our models achieve\nstate-of-the-art F1's, outperforming the prior best in-domain model by 3.5\n(VerbNet) and 0.8 (PropBank). For out-of-domain generalization, our models\nsurpass the prior best by 3.4 (VerbNet) and 0.2 (PropBank).\n","authors":["Tao Li","Ghazaleh Kazeminejad","Susan W. Brown","Martha Palmer","Vivek Srikumar"],"pdf_url":"https://arxiv.org/pdf/2305.14600v2.pdf","comment":"Accepted at Findings of EMNLP 2023"},{"id":"http://arxiv.org/abs/2310.08740v1","updated":"2023-10-12T21:53:37Z","published":"2023-10-12T21:53:37Z","title":"A Zero-Shot Language Agent for Computer Control with Structured\n Reflection","summary":" Large language models (LLMs) have shown increasing capacity at planning and\nexecuting a high-level goal in a live computer environment (e.g. MiniWoB++). To\nperform a task, recent works often require a model to learn from trace examples\nof the task via either supervised learning or few/many-shot prompting. 
Without\nthese trace examples, it remains a challenge how an agent can autonomously\nlearn and improve its control on a computer, which limits the ability of an\nagent to perform a new task. We approach this problem with a zero-shot agent\nthat requires no given expert traces. Our agent plans for executable actions on\na partially observed environment, and iteratively progresses a task by\nidentifying and learning from its mistakes via self-reflection and structured\nthought management. On the easy tasks of MiniWoB++, we show that our zero-shot\nagent often outperforms recent SoTAs, with more efficient reasoning. For tasks\nwith more complexity, our reflective agent performs on par with prior best\nmodels, even though previous works had the advantages of accessing expert\ntraces or additional screen information.\n","authors":["Tao Li","Gang Li","Zhiwei Deng","Bryan Wang","Yang Li"],"pdf_url":"https://arxiv.org/pdf/2310.08740v1.pdf","comment":"Accepted at Findings of EMNLP 2023"},{"id":"http://arxiv.org/abs/2305.16130v2","updated":"2023-10-12T21:43:18Z","published":"2023-05-25T15:04:01Z","title":"A Mechanism for Solving Relational Tasks in Transformer Language Models","summary":" A primary criticism towards language models (LMs) is their inscrutability.\nThis paper presents evidence that, despite their size and complexity, LMs\nsometimes exploit a simple computational mechanism to solve one-to-one\nrelational tasks (e.g., capital_of(Poland)=Warsaw). We investigate a range of\nlanguage model sizes (from 124M parameters to 176B parameters) in an in-context\nlearning setting, and find that for a variety of tasks (involving capital\ncities, upper-casing, and past-tensing) a key part of the mechanism reduces to\na simple linear update typically applied by the feedforward (FFN) networks.\nThese updates also tend to promote the output of the relation in a\ncontent-independent way (e.g., encoding Poland:Warsaw::China:Beijing),\nrevealing a predictable pattern that these models take in solving these tasks.\nWe further show that this mechanism is specific to tasks that require retrieval\nfrom pretraining memory, rather than retrieval from local context. Our results\ncontribute to a growing body of work on the mechanistic interpretability of\nLLMs, and offer reason to be optimistic that, despite the massive and\nnon-linear nature of the models, the strategies they ultimately use to solve\ntasks can sometimes reduce to familiar and even intuitive algorithms.\n","authors":["Jack Merullo","Carsten Eickhoff","Ellie Pavlick"],"pdf_url":"https://arxiv.org/pdf/2305.16130v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.08730v2","updated":"2023-10-12T21:28:02Z","published":"2023-09-15T19:31:40Z","title":"MusiLingo: Bridging Music and Text with Pre-trained Language Models for\n Music Captioning and Query Response","summary":" Large Language Models (LLMs) have shown immense potential in multimodal\napplications, yet the convergence of textual and musical domains remains\nrelatively unexplored. To address this gap, we present MusiLingo, a novel\nsystem for music caption generation and music-related query responses.\nMusiLingo employs a single projection layer to align music representations from\nthe pre-trained frozen music audio model MERT with the frozen Vicuna-7B\nlanguage model (an adaption of LLaMA), bridging the gap between music audio and\ntextual contexts. We train it on an extensive music caption dataset and\nfine-tune it with instructional data. 
Due to the scarcity of high-quality music\nQ\\&A datasets, we created the Music Instruct (MI) dataset from captions in the\nMusicCaps datasets, tailored for open-ended music inquiries. Empirical\nevaluations demonstrate its competitive performance in generating music\ncaptions and composing music-related Q&A pairs.\n","authors":["Zihao Deng","Yinghao Ma","Yudong Liu","Rongchen Guo","Ge Zhang","Wenhu Chen","Wenhao Huang","Emmanouil Benetos"],"pdf_url":"https://arxiv.org/pdf/2309.08730v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04711v3","updated":"2023-10-12T21:25:15Z","published":"2023-08-09T05:06:39Z","title":"Answering Unseen Questions With Smaller Language Models Using Rationale\n Generation and Dense Retrieval","summary":" When provided with sufficient explanatory context, smaller Language Models\nhave been shown to exhibit strong reasoning ability on challenging short-answer\nquestion-answering tasks where the questions are unseen in training. We\nevaluate two methods for further improvement in this setting. Both methods\nfocus on combining rationales generated by a larger Language Model with longer\ncontexts created from a multi-hop dense retrieval system. The first method\n($\\textit{RR}$) involves training a Rationale Ranking model to score both\ngenerated rationales and retrieved contexts with respect to relevance and\ntruthfulness. We then use the scores to derive combined contexts from both\nknowledge sources using a number of combinatory strategies. For the second\nmethod ($\\textit{RATD}$) we utilise retrieval-augmented training datasets\ndeveloped by Hartill et al. 2023 to train a smaller Reasoning model such that\nit becomes proficient at utilising relevant information from longer text\nsequences that may be only partially evidential and frequently contain many\nirrelevant sentences. We find that both methods significantly improve results.\nOur single best Reasoning model materially improves upon strong comparable\nprior baselines for unseen evaluation datasets (StrategyQA 58.9 $\\rightarrow$\n61.7 acc., CommonsenseQA 63.6 $\\rightarrow$ 72.7 acc., ARC-DA 31.6\n$\\rightarrow$ 52.1 F1, IIRC 25.5 $\\rightarrow$ 27.3 F1) and a version utilising\nour prior knowledge of each type of question in selecting a context combination\nstrategy does even better. Our proposed models also generally outperform direct\nprompts against much larger models (BLOOM 175B and StableVicuna 13B) in both\nfew-shot chain-of-thought and standard few-shot settings.\n","authors":["Tim Hartill","Diana Benavides-Prado","Michael Witbrock","Patricia J. Riddle"],"pdf_url":"https://arxiv.org/pdf/2308.04711v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.14707v2","updated":"2023-10-12T21:13:09Z","published":"2023-05-24T04:24:16Z","title":"SciFix: Outperforming GPT3 on Scientific Factual Error Correction","summary":" Due to the prohibitively high cost of creating error correction datasets,\nmost Factual Claim Correction methods rely on a powerful verification model to\nguide the correction process. This leads to a significant drop in performance\nin domains like scientific claims, where good verification models do not always\nexist. 
In this work, we introduce SciFix, a scientific claim correction system\nthat does not require a verifier but can outperform existing methods by a\nconsiderable margin -- achieving correction accuracy of 84% on the SciFact\ndataset, 77% on SciFact-Open and 72% on the CovidFact dataset, compared to next\nbest accuracies of 7%, 5%, and 15% on the same datasets respectively. Our\nmethod leverages the power of prompting with LLMs during training to create a\nrichly annotated dataset that can be used for fully supervised training and\nregularization. We additionally use a claim-aware decoding procedure to improve\nthe quality of corrected claims. Our method outperforms the very LLM that was\nused to generate the annotated dataset -- with Few-Shot Prompting on GPT3.5\nachieving 58%, 61%, and 64% on the respective datasets, a consistently lower\ncorrection accuracy, despite using nearly 800 times as many parameters as our\nmodel.\n","authors":["Dhananjay Ashok","Atharva Kulkarni","Hai Pham","Barnabás Póczos"],"pdf_url":"https://arxiv.org/pdf/2305.14707v2.pdf","comment":"To appear in proceedings of EMNLP2023 (findings)"},{"id":"http://arxiv.org/abs/2310.08715v1","updated":"2023-10-12T20:53:39Z","published":"2023-10-12T20:53:39Z","title":"Toward Joint Language Modeling for Speech Units and Text","summary":" Speech and text are two major forms of human language. The research community\nhas been focusing on mapping speech to text or vice versa for many years.\nHowever, in the field of language modeling, very little effort has been made to\nmodel them jointly. In light of this, we explore joint language modeling for\nspeech units and text. Specifically, we compare different speech tokenizers to\ntransform continuous speech signals into discrete units and use different\nmethods to construct mixed speech-text data. We introduce automatic metrics to\nevaluate how well the joint LM mixes speech and text. We also fine-tune the LM\non downstream spoken language understanding (SLU) tasks with different\nmodalities (speech or text) and test its performance to assess the model's\nlearning of shared representations. Our results show that by mixing speech\nunits and text with our proposed mixing techniques, the joint LM improves over\na speech-only baseline on SLU tasks and shows zero-shot cross-modal\ntransferability.\n","authors":["Ju-Chieh Chou","Chung-Ming Chien","Wei-Ning Hsu","Karen Livescu","Arun Babu","Alexis Conneau","Alexei Baevski","Michael Auli"],"pdf_url":"https://arxiv.org/pdf/2310.08715v1.pdf","comment":"EMNLP findings 2023"},{"id":"http://arxiv.org/abs/2310.01248v2","updated":"2023-10-12T20:10:37Z","published":"2023-10-02T14:32:07Z","title":"Improving Emotional Expression and Cohesion in Image-Based Playlist\n Description and Music Topics: A Continuous Parameterization Approach","summary":" Text generation in image-based platforms, particularly for music-related\ncontent, requires precise control over text styles and the incorporation of\nemotional expression. However, existing approaches often need help to control\nthe proportion of external factors in generated text and rely on discrete\ninputs, lacking continuous control conditions for desired text generation. This\nstudy proposes Continuous Parameterization for Controlled Text Generation\n(CPCTG) to overcome these limitations. Our approach leverages a Language Model\n(LM) as a style learner, integrating Semantic Cohesion (SC) and Emotional\nExpression Proportion (EEP) considerations. 
By enhancing the reward method and\nmanipulating the CPCTG level, our experiments on playlist description and music\ntopic generation tasks demonstrate significant improvements in ROUGE scores,\nindicating enhanced relevance and coherence in the generated text.\n","authors":["Yuelyu Ji","Yuheng Song","Wei Wang","Ruoyi Xu","Zhongqian Xie","Huiyun Liu"],"pdf_url":"https://arxiv.org/pdf/2310.01248v2.pdf","comment":"Becasue I find some important fourmulation need to change"},{"id":"http://arxiv.org/abs/2310.08678v1","updated":"2023-10-12T19:28:57Z","published":"2023-10-12T19:28:57Z","title":"Can GPT models be Financial Analysts? An Evaluation of ChatGPT and GPT-4\n on mock CFA Exams","summary":" Large Language Models (LLMs) have demonstrated remarkable performance on a\nwide range of Natural Language Processing (NLP) tasks, often matching or even\nbeating state-of-the-art task-specific models. This study aims at assessing the\nfinancial reasoning capabilities of LLMs. We leverage mock exam questions of\nthe Chartered Financial Analyst (CFA) Program to conduct a comprehensive\nevaluation of ChatGPT and GPT-4 in financial analysis, considering Zero-Shot\n(ZS), Chain-of-Thought (CoT), and Few-Shot (FS) scenarios. We present an\nin-depth analysis of the models' performance and limitations, and estimate\nwhether they would have a chance at passing the CFA exams. Finally, we outline\ninsights into potential strategies and improvements to enhance the\napplicability of LLMs in finance. In this perspective, we hope this work paves\nthe way for future studies to continue enhancing LLMs for financial reasoning\nthrough rigorous evaluation.\n","authors":["Ethan Callanan","Amarachi Mbakwe","Antony Papadimitriou","Yulong Pei","Mathieu Sibue","Xiaodan Zhu","Zhiqiang Ma","Xiaomo Liu","Sameena Shah"],"pdf_url":"https://arxiv.org/pdf/2310.08678v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.09849v5","updated":"2023-10-12T19:02:38Z","published":"2022-12-19T20:46:43Z","title":"Dataless Knowledge Fusion by Merging Weights of Language Models","summary":" Fine-tuning pre-trained language models has become the prevalent paradigm for\nbuilding downstream NLP models. Oftentimes fine-tuned models are readily\navailable but their training data is not, due to data privacy or intellectual\nproperty concerns. This creates a barrier to fusing knowledge across individual\nmodels to yield a better single model. In this paper, we study the problem of\nmerging individual models built on different training data sets to obtain a\nsingle model that performs well both across all data set domains and can\ngeneralize on out-of-domain data. We propose a dataless knowledge fusion method\nthat merges models in their parameter space, guided by weights that minimize\nprediction differences between the merged model and the individual models. Over\na battery of evaluation settings, we show that the proposed method\nsignificantly outperforms baselines such as Fisher-weighted averaging or model\nensembling. Further, we find that our method is a promising alternative to\nmulti-task learning that can preserve or sometimes improve over the individual\nmodels without access to the training data. 
Finally, model merging is more\nefficient than training a multi-task model, thus making it applicable to a\nwider set of scenarios.\n","authors":["Xisen Jin","Xiang Ren","Daniel Preotiuc-Pietro","Pengxiang Cheng"],"pdf_url":"https://arxiv.org/pdf/2212.09849v5.pdf","comment":"ICLR 2023; The code is available at\n https://github.com/bloomberg/dataless-model-merging"},{"id":"http://arxiv.org/abs/1806.07722v3","updated":"2023-10-12T18:51:54Z","published":"2018-06-15T09:29:05Z","title":"Stylized innovation: generating timelines by interrogating incrementally\n available randomised dictionaries","summary":" A key challenge when trying to understand innovation is that it is a dynamic,\nongoing process, which can be highly contingent on ephemeral factors such as\nculture, economics, or luck. This means that any analysis of the real-world\nprocess must necessarily be historical - and thus probably too late to be most\nuseful - but also cannot be sure what the properties of the web of connections\nbetween innovations is or was. Here I try to address this by designing and\ngenerating a set of synthetic innovation web \"dictionaries\" that can be used to\nhost sampled innovation timelines, probe the overall statistics and behaviours\nof these processes, and determine the degree of their reliance on the structure\nor generating algorithm. Thus, inspired by the work of Fink, Reeves, Palma and\nFarr (2017) on innovation in language, gastronomy, and technology, I study how\nnew symbol discovery manifests itself in terms of additional \"word\" vocabulary\nbeing available from dictionaries generated from a finite number of symbols.\nSeveral distinct dictionary generation models are investigated using numerical\nsimulation, with emphasis on the scaling of knowledge as dictionary generators\nand parameters are varied, and the role of which order the symbols are\ndiscovered in.\n","authors":["Paul Kinsler"],"pdf_url":"https://arxiv.org/pdf/1806.07722v3.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2302.04863v3","updated":"2023-10-12T18:42:34Z","published":"2023-02-09T18:59:18Z","title":"Knowledge is a Region in Weight Space for Fine-tuned Language Models","summary":" Research on neural networks has focused on understanding a single model\ntrained on a single dataset. However, relatively little is known about the\nrelationships between different models, particularly those trained or tested on\ndifferent datasets. We address this by studying how the weight space and the\nunderlying loss landscape of different models are interconnected.\n Specifically, we demonstrate that finetuned models that were optimized for\nhigh performance, reside in well-defined regions in weight space, and vice\nversa -- that any model that resides anywhere in those regions also exhibits\nhigh performance. Notably, we show that language models that have been\nfinetuned on the same dataset form a tight cluster in the weight space, while\nmodels finetuned on different datasets from the same underlying task form a\nlooser cluster. Moreover, traversing around the region between the models leads\nto new models that perform comparably or even better than models obtained via\nfinetuning, even on tasks that the original models were not finetuned on.\n Our findings provide insight into the relationships between models,\ndemonstrating that a model positioned between two similar models can acquire\nthe knowledge of both. We leverage this and design a method for selecting a\nbetter model for efficient finetuning. 
Specifically, we show that starting from\nthe center of the region is as effective, if not more, than using the\npretrained model in 11 out of 12 datasets, resulting in an average accuracy\nimprovement of 3.06.\n","authors":["Almog Gueta","Elad Venezian","Colin Raffel","Noam Slonim","Yoav Katz","Leshem Choshen"],"pdf_url":"https://arxiv.org/pdf/2302.04863v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.04668v2","updated":"2023-10-12T18:34:08Z","published":"2023-10-07T03:14:11Z","title":"Label-free Node Classification on Graphs with Large Language Models\n (LLMS)","summary":" In recent years, there have been remarkable advancements in node\nclassification achieved by Graph Neural Networks (GNNs). However, they\nnecessitate abundant high-quality labels to ensure promising performance. In\ncontrast, Large Language Models (LLMs) exhibit impressive zero-shot proficiency\non text-attributed graphs. Yet, they face challenges in efficiently processing\nstructural data and suffer from high inference costs. In light of these\nobservations, this work introduces a label-free node classification on graphs\nwith LLMs pipeline, LLM-GNN. It amalgamates the strengths of both GNNs and LLMs\nwhile mitigating their limitations. Specifically, LLMs are leveraged to\nannotate a small portion of nodes and then GNNs are trained on LLMs'\nannotations to make predictions for the remaining large portion of nodes. The\nimplementation of LLM-GNN faces a unique challenge: how can we actively select\nnodes for LLMs to annotate and consequently enhance the GNN training? How can\nwe leverage LLMs to obtain annotations of high quality, representativeness, and\ndiversity, thereby enhancing GNN performance with less cost? To tackle this\nchallenge, we develop an annotation quality heuristic and leverage the\nconfidence scores derived from LLMs to advanced node selection. Comprehensive\nexperimental results validate the effectiveness of LLM-GNN. In particular,\nLLM-GNN can achieve an accuracy of 74.9% on a vast-scale dataset \\products with\na cost less than 1 dollar.\n","authors":["Zhikai Chen","Haitao Mao","Hongzhi Wen","Haoyu Han","Wei Jin","Haiyang Zhang","Hui Liu","Jiliang Tang"],"pdf_url":"https://arxiv.org/pdf/2310.04668v2.pdf","comment":"The code will be available soon via\n https://github.com/CurryTang/LLMGNN"},{"id":"http://arxiv.org/abs/2310.08659v1","updated":"2023-10-12T18:34:08Z","published":"2023-10-12T18:34:08Z","title":"LoftQ: LoRA-Fine-Tuning-Aware Quantization for Large Language Models","summary":" Quantization is an indispensable technique for serving Large Language Models\n(LLMs) and has recently found its way into LoRA fine-tuning. In this work we\nfocus on the scenario where quantization and LoRA fine-tuning are applied\ntogether on a pre-trained model. In such cases it is common to observe a\nconsistent gap in the performance on downstream tasks between full fine-tuning\nand quantization plus LoRA fine-tuning approach. In response, we propose LoftQ\n(LoRA-Fine-Tuning-aware Quantization), a novel quantization framework that\nsimultaneously quantizes an LLM and finds a proper low-rank initialization for\nLoRA fine-tuning. Such an initialization alleviates the discrepancy between the\nquantized and full-precision model and significantly improves the\ngeneralization in downstream tasks. We evaluate our method on natural language\nunderstanding, question answering, summarization, and natural language\ngeneration tasks. 
Experiments show that our method is highly effective and\noutperforms existing quantization methods, especially in the challenging 2-bit\nand 2/4-bit mixed precision regimes. We will release our code.\n","authors":["Yixiao Li","Yifan Yu","Chen Liang","Pengcheng He","Nikos Karampatziakis","Weizhu Chen","Tuo Zhao"],"pdf_url":"https://arxiv.org/pdf/2310.08659v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.05280v3","updated":"2023-10-12T18:28:09Z","published":"2023-10-08T21:03:18Z","title":"Are Personalized Stochastic Parrots More Dangerous? Evaluating Persona\n Biases in Dialogue Systems","summary":" Recent advancements in Large Language Models empower them to follow freeform\ninstructions, including imitating generic or specific demographic personas in\nconversations. Generic personas refer to an individual from a demographic group\n(e.g. an Asian person), whereas specific personas can be actual names of\nhistorical figures. While the adoption of personas allows dialogue systems to\nbe more engaging and approachable to users, it also carries the potential risk\nof exacerbating social biases in model responses, further causing societal\nharms through interactions with users. In this paper, we systematically study\n\"persona biases\", which we define to be the sensitivity of harmful dialogue\nmodel behaviors to different persona adoptions. We categorize persona biases\ninto biases in harmful expression and harmful agreement, as well as establish a\ncomprehensive evaluation framework to measure persona biases in five aspects:\nOffensiveness, Toxic Continuation, Regard, Stereotype Agreement, and Toxic\nAgreement. Additionally, we propose to comprehensively investigate persona\nbiases through experimenting with UniversalPersona, a systematized persona\ndataset with a comprehensive list of both generic and specific model personas.\nThrough benchmarking on four different models, including Blender, ChatGPT,\nAlpaca, and Vicuna, our study uncovers significant persona biases in these\ndialogue systems.Findings of our study underscores the immediate need to\nrevisit the use of persona traits in dialogue agents, to ensure their safe\napplication.\n","authors":["Yixin Wan","Jieyu Zhao","Aman Chadha","Nanyun Peng","Kai-Wei Chang"],"pdf_url":"https://arxiv.org/pdf/2310.05280v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.05185v2","updated":"2023-10-12T18:26:46Z","published":"2023-10-08T14:47:13Z","title":"Text2NKG: Fine-Grained N-ary Relation Extraction for N-ary relational\n Knowledge Graph Construction","summary":" Beyond traditional binary relational facts, n-ary relational knowledge graphs\n(NKGs) are comprised of n-ary relational facts containing more than two\nentities, which are closer to real-world facts with broader applications.\nHowever, the construction of NKGs still significantly relies on manual labor,\nand n-ary relation extraction still remains at a course-grained level, which is\nalways in a single schema and fixed arity of entities. To address these\nrestrictions, we propose Text2NKG, a novel fine-grained n-ary relation\nextraction framework for n-ary relational knowledge graph construction. We\nintroduce a span-tuple classification approach with hetero-ordered merging to\naccomplish fine-grained n-ary relation extraction in different arity.\nFurthermore, Text2NKG supports four typical NKG schemas: hyper-relational\nschema, event-based schema, role-based schema, and hypergraph-based schema,\nwith high flexibility and practicality. 
Experimental results demonstrate that\nText2NKG outperforms the previous state-of-the-art model by nearly 20\\% points\nin the $F_1$ scores on the fine-grained n-ary relation extraction benchmark in\nthe hyper-relational schema. Our code and datasets are publicly available.\n","authors":["Haoran Luo","Haihong E","Yuhao Yang","Tianyu Yao","Yikai Guo","Zichen Tang","Wentai Zhang","Kaiyang Wan","Shiyao Peng","Meina Song","Wei Lin"],"pdf_url":"https://arxiv.org/pdf/2310.05185v2.pdf","comment":"Preprint"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2310.08587v1","updated":"2023-10-12T17:59:58Z","published":"2023-10-12T17:59:58Z","title":"Is Generalized Dynamic Novel View Synthesis from Monocular Videos\n Possible Today?","summary":" Rendering scenes observed in a monocular video from novel viewpoints is a\nchallenging problem. For static scenes the community has studied both\nscene-specific optimization techniques, which optimize on every test scene, and\ngeneralized techniques, which only run a deep net forward pass on a test scene.\nIn contrast, for dynamic scenes, scene-specific optimization techniques exist,\nbut, to our best knowledge, there is currently no generalized method for\ndynamic novel view synthesis from a given monocular video. To answer whether\ngeneralized dynamic novel view synthesis from monocular videos is possible\ntoday, we establish an analysis framework based on existing techniques and work\ntoward the generalized approach. We find a pseudo-generalized process without\nscene-specific appearance optimization is possible, but geometrically and\ntemporally consistent depth estimates are needed. Despite no scene-specific\nappearance optimization, the pseudo-generalized approach improves upon some\nscene-specific methods.\n","authors":["Xiaoming Zhao","Alex Colburn","Fangchang Ma","Miguel Angel Bautista","Joshua M. Susskind","Alexander G. Schwing"],"pdf_url":"https://arxiv.org/pdf/2310.08587v1.pdf","comment":"Project page: https://xiaoming-zhao.github.io/projects/pgdvs"},{"id":"http://arxiv.org/abs/2310.08588v1","updated":"2023-10-12T17:59:58Z","published":"2023-10-12T17:59:58Z","title":"Octopus: Embodied Vision-Language Programmer from Environmental Feedback","summary":" Large vision-language models (VLMs) have achieved substantial progress in\nmultimodal perception and reasoning. Furthermore, when seamlessly integrated\ninto an embodied agent, it signifies a crucial stride towards the creation of\nautonomous and context-aware systems capable of formulating plans and executing\ncommands with precision. In this paper, we introduce Octopus, a novel VLM\ndesigned to proficiently decipher an agent's vision and textual task objectives\nand to formulate intricate action sequences and generate executable code. Our\ndesign allows the agent to adeptly handle a wide spectrum of tasks, ranging\nfrom mundane daily chores in simulators to sophisticated interactions in\ncomplex video games. Octopus is trained by leveraging GPT-4 to control an\nexplorative agent to generate training data, i.e., action blueprints and the\ncorresponding executable code, within our experimental environment called\nOctoVerse. We also collect the feedback that allows the enhanced training\nscheme of Reinforcement Learning with Environmental Feedback (RLEF). Through a\nseries of experiments, we illuminate Octopus's functionality and present\ncompelling results, and the proposed RLEF turns out to refine the agent's\ndecision-making. 
By open-sourcing our model architecture, simulator, and\ndataset, we aspire to ignite further innovation and foster collaborative\napplications within the broader embodied AI community.\n","authors":["Jingkang Yang","Yuhao Dong","Shuai Liu","Bo Li","Ziyue Wang","Chencheng Jiang","Haoran Tan","Jiamu Kang","Yuanhan Zhang","Kaiyang Zhou","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2310.08588v1.pdf","comment":"Project Page: https://choiszt.github.io/Octopus/, Codebase:\n https://github.com/dongyh20/Octopus"},{"id":"http://arxiv.org/abs/2310.08585v1","updated":"2023-10-12T17:59:57Z","published":"2023-10-12T17:59:57Z","title":"Im4D: High-Fidelity and Real-Time Novel View Synthesis for Dynamic\n Scenes","summary":" This paper aims to tackle the challenge of dynamic view synthesis from\nmulti-view videos. The key observation is that while previous grid-based\nmethods offer consistent rendering, they fall short in capturing appearance\ndetails of a complex dynamic scene, a domain where multi-view image-based\nrendering methods demonstrate the opposite properties. To combine the best of\ntwo worlds, we introduce Im4D, a hybrid scene representation that consists of a\ngrid-based geometry representation and a multi-view image-based appearance\nrepresentation. Specifically, the dynamic geometry is encoded as a 4D density\nfunction composed of spatiotemporal feature planes and a small MLP network,\nwhich globally models the scene structure and facilitates the rendering\nconsistency. We represent the scene appearance by the original multi-view\nvideos and a network that learns to predict the color of a 3D point from image\nfeatures, instead of memorizing detailed appearance totally with networks,\nthereby naturally making the learning of networks easier. Our method is\nevaluated on five dynamic view synthesis datasets including DyNeRF, ZJU-MoCap,\nNHR, DNA-Rendering and ENeRF-Outdoor datasets. The results show that Im4D\nexhibits state-of-the-art performance in rendering quality and can be trained\nefficiently, while realizing real-time rendering with a speed of 79.8 FPS for\n512x512 images, on a single RTX 3090 GPU.\n","authors":["Haotong Lin","Sida Peng","Zhen Xu","Tao Xie","Xingyi He","Hujun Bao","Xiaowei Zhou"],"pdf_url":"https://arxiv.org/pdf/2310.08585v1.pdf","comment":"SIGGRAPH Asia 2023; Project page: https://zju3dv.github.io/im4d"},{"id":"http://arxiv.org/abs/2310.08586v1","updated":"2023-10-12T17:59:57Z","published":"2023-10-12T17:59:57Z","title":"PonderV2: Pave the Way for 3D Foundataion Model with A Universal\n Pre-training Paradigm","summary":" In contrast to numerous NLP and 2D computer vision foundational models, the\nlearning of a robust and highly generalized 3D foundational model poses\nconsiderably greater challenges. This is primarily due to the inherent data\nvariability and the diversity of downstream tasks. In this paper, we introduce\na comprehensive 3D pre-training framework designed to facilitate the\nacquisition of efficient 3D representations, thereby establishing a pathway to\n3D foundational models. Motivated by the fact that informative 3D features\nshould be able to encode rich geometry and appearance cues that can be utilized\nto render realistic images, we propose a novel universal paradigm to learn\npoint cloud representations by differentiable neural rendering, serving as a\nbridge between 3D and 2D worlds. We train a point cloud encoder within a\ndevised volumetric neural renderer by comparing the rendered images with the\nreal images. 
Notably, our approach demonstrates the seamless integration of the\nlearned 3D encoder into diverse downstream tasks. These tasks encompass not\nonly high-level challenges such as 3D detection and segmentation but also\nlow-level objectives like 3D reconstruction and image synthesis, spanning both\nindoor and outdoor scenarios. Besides, we also illustrate the capability of\npre-training a 2D backbone using the proposed universal methodology, surpassing\nconventional pre-training methods by a large margin. For the first time,\n\\sexyname achieves state-of-the-art performance on 11 indoor and outdoor\nbenchmarks. The consistent improvements in various settings imply the\neffectiveness of the proposed method. Code and models will be made available at\nhttps://github.com/Pointcept/Pointcept.\n","authors":["Haoyi Zhu","Honghui Yang","Xiaoyang Wu","Di Huang","Sha Zhang","Xianglong He","Tong He","Hengshuang Zhao","Chunhua Shen","Yu Qiao","Wanli Ouyang"],"pdf_url":"https://arxiv.org/pdf/2310.08586v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2301.00157"},{"id":"http://arxiv.org/abs/2310.08584v1","updated":"2023-10-12T17:59:55Z","published":"2023-10-12T17:59:55Z","title":"Is ImageNet worth 1 video? Learning strong image encoders from 1 long\n unlabelled video","summary":" Self-supervised learning has unlocked the potential of scaling up pretraining\nto billions of images, since annotation is unnecessary. But are we making the\nbest use of data? How more economical can we be? In this work, we attempt to\nanswer this question by making two contributions. First, we investigate\nfirst-person videos and introduce a \"Walking Tours\" dataset. These videos are\nhigh-resolution, hours-long, captured in a single uninterrupted take, depicting\na large number of objects and actions with natural scene transitions. They are\nunlabeled and uncurated, thus realistic for self-supervision and comparable\nwith human learning.\n Second, we introduce a novel self-supervised image pretraining method\ntailored for learning from continuous videos. Existing methods typically adapt\nimage-based pretraining approaches to incorporate more frames. Instead, we\nadvocate a \"tracking to learn to recognize\" approach. Our method called DoRA,\nleads to attention maps that Discover and tRAck objects over time in an\nend-to-end manner, using transformer cross-attention. We derive multiple views\nfrom the tracks and use them in a classical self-supervised distillation loss.\nUsing our novel approach, a single Walking Tours video remarkably becomes a\nstrong competitor to ImageNet for several image and video downstream tasks.\n","authors":["Shashanka Venkataramanan","Mamshad Nayeem Rizve","João Carreira","Yuki M. Asano","Yannis Avrithis"],"pdf_url":"https://arxiv.org/pdf/2310.08584v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08581v1","updated":"2023-10-12T17:59:41Z","published":"2023-10-12T17:59:41Z","title":"Universal Visual Decomposer: Long-Horizon Manipulation Made Easy","summary":" Real-world robotic tasks stretch over extended horizons and encompass\nmultiple stages. Learning long-horizon manipulation tasks, however, is a\nlong-standing challenge, and demands decomposing the overarching task into\nseveral manageable subtasks to facilitate policy learning and generalization to\nunseen tasks. Prior task decomposition methods require task-specific knowledge,\nare computationally intensive, and cannot readily be applied to new tasks. 
To\naddress these shortcomings, we propose Universal Visual Decomposer (UVD), an\noff-the-shelf task decomposition method for visual long horizon manipulation\nusing pre-trained visual representations designed for robotic control. At a\nhigh level, UVD discovers subgoals by detecting phase shifts in the embedding\nspace of the pre-trained representation. Operating purely on visual\ndemonstrations without auxiliary information, UVD can effectively extract\nvisual subgoals embedded in the videos, while incurring zero additional\ntraining cost on top of standard visuomotor policy training. Goal-conditioned\npolicies learned with UVD-discovered subgoals exhibit significantly improved\ncompositional generalization at test time to unseen tasks. Furthermore,\nUVD-discovered subgoals can be used to construct goal-based reward shaping that\njump-starts temporally extended exploration for reinforcement learning. We\nextensively evaluate UVD on both simulation and real-world tasks, and in all\ncases, UVD substantially outperforms baselines across imitation and\nreinforcement learning settings on in-domain and out-of-domain task sequences\nalike, validating the clear advantage of automated visual task decomposition\nwithin the simple, compact UVD framework.\n","authors":["Zichen Zhang","Yunshuang Li","Osbert Bastani","Abhishek Gupta","Dinesh Jayaraman","Yecheng Jason Ma","Luca Weihs"],"pdf_url":"https://arxiv.org/pdf/2310.08581v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08580v1","updated":"2023-10-12T17:59:38Z","published":"2023-10-12T17:59:38Z","title":"OmniControl: Control Any Joint at Any Time for Human Motion Generation","summary":" We present a novel approach named OmniControl for incorporating flexible\nspatial control signals into a text-conditioned human motion generation model\nbased on the diffusion process. Unlike previous methods that can only control\nthe pelvis trajectory, OmniControl can incorporate flexible spatial control\nsignals over different joints at different times with only one model.\nSpecifically, we propose analytic spatial guidance that ensures the generated\nmotion can tightly conform to the input control signals. At the same time,\nrealism guidance is introduced to refine all the joints to generate more\ncoherent motion. Both the spatial and realism guidance are essential and they\nare highly complementary for balancing control accuracy and motion realism. By\ncombining them, OmniControl generates motions that are realistic, coherent, and\nconsistent with the spatial constraints. Experiments on HumanML3D and KIT-ML\ndatasets show that OmniControl not only achieves significant improvement over\nstate-of-the-art methods on pelvis control but also shows promising results\nwhen incorporating the constraints over other joints.\n","authors":["Yiming Xie","Varun Jampani","Lei Zhong","Deqing Sun","Huaizu Jiang"],"pdf_url":"https://arxiv.org/pdf/2310.08580v1.pdf","comment":"Project page: https://neu-vi.github.io/omnicontrol/"},{"id":"http://arxiv.org/abs/2310.08579v1","updated":"2023-10-12T17:59:34Z","published":"2023-10-12T17:59:34Z","title":"HyperHuman: Hyper-Realistic Human Generation with Latent Structural\n Diffusion","summary":" Despite significant advances in large-scale text-to-image models, achieving\nhyper-realistic human image generation remains a desirable yet unsolved task.\nExisting models like Stable Diffusion and DALL-E 2 tend to generate human\nimages with incoherent parts or unnatural poses. 
To tackle these challenges,\nour key insight is that human image is inherently structural over multiple\ngranularities, from the coarse-level body skeleton to fine-grained spatial\ngeometry. Therefore, capturing such correlations between the explicit\nappearance and latent structure in one model is essential to generate coherent\nand natural human images. To this end, we propose a unified framework,\nHyperHuman, that generates in-the-wild human images of high realism and diverse\nlayouts. Specifically, 1) we first build a large-scale human-centric dataset,\nnamed HumanVerse, which consists of 340M images with comprehensive annotations\nlike human pose, depth, and surface normal. 2) Next, we propose a Latent\nStructural Diffusion Model that simultaneously denoises the depth and surface\nnormal along with the synthesized RGB image. Our model enforces the joint\nlearning of image appearance, spatial relationship, and geometry in a unified\nnetwork, where each branch in the model complements to each other with both\nstructural awareness and textural richness. 3) Finally, to further boost the\nvisual quality, we propose a Structure-Guided Refiner to compose the predicted\nconditions for more detailed generation of higher resolution. Extensive\nexperiments demonstrate that our framework yields the state-of-the-art\nperformance, generating hyper-realistic human images under diverse scenarios.\nProject Page: https://snap-research.github.io/HyperHuman/\n","authors":["Xian Liu","Jian Ren","Aliaksandr Siarohin","Ivan Skorokhodov","Yanyu Li","Dahua Lin","Xihui Liu","Ziwei Liu","Sergey Tulyakov"],"pdf_url":"https://arxiv.org/pdf/2310.08579v1.pdf","comment":"Project Page: https://snap-research.github.io/HyperHuman/"},{"id":"http://arxiv.org/abs/2310.08577v1","updated":"2023-10-12T17:59:30Z","published":"2023-10-12T17:59:30Z","title":"Visual Data-Type Understanding does not emerge from Scaling\n Vision-Language Models","summary":" Recent advances in the development of vision-language models (VLMs) are\nyielding remarkable success in recognizing visual semantic content, including\nimpressive instances of compositional image understanding. Here, we introduce\nthe novel task of \\textit{Visual Data-Type Identification}, a basic perceptual\nskill with implications for data curation (e.g., noisy data-removal from large\ndatasets, domain-specific retrieval) and autonomous vision (e.g.,\ndistinguishing changing weather conditions from camera lens staining). We\ndevelop two datasets consisting of animal images altered across a diverse set\nof 27 visual \\textit{data-types}, spanning four broad categories. An extensive\nzero-shot evaluation of 39 VLMs, ranging from 100M to 80B parameters, shows a\nnuanced performance landscape. While VLMs are reasonably good at identifying\ncertain stylistic \\textit{data-types}, such as cartoons and sketches, they\nstruggle with simpler \\textit{data-types} arising from basic manipulations like\nimage rotations or additive noise. Our findings reveal that (i) model scaling\nalone yields marginal gains for contrastively-trained models like CLIP, and\n(ii) there is a pronounced drop in performance for the largest\nauto-regressively trained VLMs like OpenFlamingo. This finding points to a\nblind spot in current frontier VLMs: they excel in recognizing semantic content\nbut fail to acquire an understanding of visual \\textit{data-types} through\nscaling. 
By analyzing the pre-training distributions of these models and\nincorporating \\textit{data-type} information into the captions during\nfine-tuning, we achieve a significant enhancement in performance. By exploring\nthis previously uncharted task, we aim to set the stage for further advancing\nVLMs to equip them with visual data-type understanding. Code and datasets are\nreleased \\href{https://github.com/bethgelab/DataTypeIdentification}{here}.\n","authors":["Vishaal Udandarao","Max F. Burg","Samuel Albanie","Matthias Bethge"],"pdf_url":"https://arxiv.org/pdf/2310.08577v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08576v1","updated":"2023-10-12T17:59:23Z","published":"2023-10-12T17:59:23Z","title":"Learning to Act from Actionless Videos through Dense Correspondences","summary":" In this work, we present an approach to construct a video-based robot policy\ncapable of reliably executing diverse tasks across different robots and\nenvironments from few video demonstrations without using any action\nannotations. Our method leverages images as a task-agnostic representation,\nencoding both the state and action information, and text as a general\nrepresentation for specifying robot goals. By synthesizing videos that\n``hallucinate'' robot executing actions and in combination with dense\ncorrespondences between frames, our approach can infer the closed-formed action\nto execute to an environment without the need of any explicit action labels.\nThis unique capability allows us to train the policy solely based on RGB videos\nand deploy learned policies to various robotic tasks. We demonstrate the\nefficacy of our approach in learning policies on table-top manipulation and\nnavigation tasks. Additionally, we contribute an open-source framework for\nefficient video modeling, enabling the training of high-fidelity policy models\nwith four GPUs within a single day.\n","authors":["Po-Chen Ko","Jiayuan Mao","Yilun Du","Shao-Hua Sun","Joshua B. Tenenbaum"],"pdf_url":"https://arxiv.org/pdf/2310.08576v1.pdf","comment":"Project page: https://flow-diffusion.github.io/"},{"id":"http://arxiv.org/abs/2112.09726v2","updated":"2023-10-12T17:57:51Z","published":"2021-12-17T19:22:01Z","title":"Soundify: Matching Sound Effects to Video","summary":" In the art of video editing, sound helps add character to an object and\nimmerse the viewer within a space. Through formative interviews with\nprofessional editors (N=10), we found that the task of adding sounds to video\ncan be challenging. This paper presents Soundify, a system that assists editors\nin matching sounds to video. Given a video, Soundify identifies matching\nsounds, synchronizes the sounds to the video, and dynamically adjusts panning\nand volume to create spatial audio. In a human evaluation study (N=889), we\nshow that Soundify is capable of matching sounds to video out-of-the-box for a\ndiverse range of audio categories. 
In a within-subjects expert study (N=12), we\ndemonstrate the usefulness of Soundify in helping video editors match sounds to\nvideo with lighter workload, reduced task completion time, and improved\nusability.\n","authors":["David Chuan-En Lin","Anastasis Germanidis","Cristóbal Valenzuela","Yining Shi","Nikolas Martelaro"],"pdf_url":"https://arxiv.org/pdf/2112.09726v2.pdf","comment":"Full paper in UIST 2023; Short paper in NeurIPS 2021 ML4CD Workshop;\n Online demo: https://soundify.cc"},{"id":"http://arxiv.org/abs/2308.11606v2","updated":"2023-10-12T17:50:38Z","published":"2023-08-22T17:53:55Z","title":"StoryBench: A Multifaceted Benchmark for Continuous Story Visualization","summary":" Generating video stories from text prompts is a complex task. In addition to\nhaving high visual quality, videos need to realistically adhere to a sequence\nof text prompts whilst being consistent throughout the frames. Creating a\nbenchmark for video generation requires data annotated over time, which\ncontrasts with the single caption used often in video datasets. To fill this\ngap, we collect comprehensive human annotations on three existing datasets, and\nintroduce StoryBench: a new, challenging multi-task benchmark to reliably\nevaluate forthcoming text-to-video models. Our benchmark includes three video\ngeneration tasks of increasing difficulty: action execution, where the next\naction must be generated starting from a conditioning video; story\ncontinuation, where a sequence of actions must be executed starting from a\nconditioning video; and story generation, where a video must be generated from\nonly text prompts. We evaluate small yet strong text-to-video baselines, and\nshow the benefits of training on story-like data algorithmically generated from\nexisting video captions. Finally, we establish guidelines for human evaluation\nof video stories, and reaffirm the need of better automatic metrics for video\ngeneration. StoryBench aims at encouraging future research efforts in this\nexciting new area.\n","authors":["Emanuele Bugliarello","Hernan Moraldo","Ruben Villegas","Mohammad Babaeizadeh","Mohammad Taghi Saffar","Han Zhang","Dumitru Erhan","Vittorio Ferrari","Pieter-Jan Kindermans","Paul Voigtlaender"],"pdf_url":"https://arxiv.org/pdf/2308.11606v2.pdf","comment":"NeurIPS D&B 2023"},{"id":"http://arxiv.org/abs/2310.08541v1","updated":"2023-10-12T17:34:20Z","published":"2023-10-12T17:34:20Z","title":"Idea2Img: Iterative Self-Refinement with GPT-4V(ision) for Automatic\n Image Design and Generation","summary":" We introduce ``Idea to Image,'' a system that enables multimodal iterative\nself-refinement with GPT-4V(ision) for automatic image design and generation.\nHumans can quickly identify the characteristics of different text-to-image\n(T2I) models via iterative explorations. This enables them to efficiently\nconvert their high-level generation ideas into effective T2I prompts that can\nproduce good images. We investigate if systems based on large multimodal models\n(LMMs) can develop analogous multimodal self-refinement abilities that enable\nexploring unknown models or environments via self-refining tries. Idea2Img\ncyclically generates revised T2I prompts to synthesize draft images, and\nprovides directional feedback for prompt revision, both conditioned on its\nmemory of the probed T2I model's characteristics. The iterative self-refinement\nbrings Idea2Img various advantages over vanilla T2I models. 
Notably, Idea2Img\ncan process input ideas with interleaved image-text sequences, follow ideas\nwith design instructions, and generate images of better semantic and visual\nqualities. The user preference study validates the efficacy of multimodal\niterative self-refinement on automatic image design and generation.\n","authors":["Zhengyuan Yang","Jianfeng Wang","Linjie Li","Kevin Lin","Chung-Ching Lin","Zicheng Liu","Lijuan Wang"],"pdf_url":"https://arxiv.org/pdf/2310.08541v1.pdf","comment":"Project page at https://idea2img.github.io/"},{"id":"http://arxiv.org/abs/2310.08538v1","updated":"2023-10-12T17:28:06Z","published":"2023-10-12T17:28:06Z","title":"Image2PCI -- A Multitask Learning Framework for Estimating Pavement\n Condition Indices Directly from Images","summary":" The Pavement Condition Index (PCI) is a widely used metric for evaluating\npavement performance based on the type, extent and severity of distresses\ndetected on a pavement surface. In recent times, significant progress has been\nmade in utilizing deep-learning approaches to automate PCI estimation process.\nHowever, the current approaches rely on at least two separate models to\nestimate PCI values -- one model dedicated to determining the type and extent\nand another for estimating their severity. This approach presents several\nchallenges, including complexities, high computational resource demands, and\nmaintenance burdens that necessitate careful consideration and resolution. To\novercome these challenges, the current study develops a unified multi-tasking\nmodel that predicts the PCI directly from a top-down pavement image. The\nproposed architecture is a multi-task model composed of one encoder for feature\nextraction and four decoders to handle specific tasks: two detection heads, one\nsegmentation head and one PCI estimation head. By multitasking, we are able to\nextract features from the detection and segmentation heads for automatically\nestimating the PCI directly from the images. The model performs very well on\nour benchmarked and open pavement distress dataset that is annotated for\nmultitask learning (the first of its kind). To our best knowledge, this is the\nfirst work that can estimate PCI directly from an image at real time speeds\nwhile maintaining excellent accuracy on all related tasks for crack detection\nand segmentation.\n","authors":["Neema Jakisa Owor","Hang Du","Abdulateef Daud","Armstrong Aboah","Yaw Adu-Gyamfi"],"pdf_url":"https://arxiv.org/pdf/2310.08538v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08537v1","updated":"2023-10-12T17:26:16Z","published":"2023-10-12T17:26:16Z","title":"XAI Benchmark for Visual Explanation","summary":" The rise of deep learning algorithms has led to significant advancements in\ncomputer vision tasks, but their \"black box\" nature has raised concerns\nregarding interpretability. Explainable AI (XAI) has emerged as a critical area\nof research aiming to open this \"black box\", and shed light on the\ndecision-making process of AI models. Visual explanations, as a subset of\nExplainable Artificial Intelligence (XAI), provide intuitive insights into the\ndecision-making processes of AI models handling visual data by highlighting\ninfluential areas in an input image. Despite extensive research conducted on\nvisual explanations, most evaluations are model-centered since the availability\nof corresponding real-world datasets with ground truth explanations is scarce\nin the context of image data. 
To bridge this gap, we introduce an XAI Benchmark\ncomprising a dataset collection from diverse topics that provide both class\nlabels and corresponding explanation annotations for images. We have processed\ndata from diverse domains to align with our unified visual explanation\nframework. We introduce a comprehensive Visual Explanation pipeline, which\nintegrates data loading, preprocessing, experimental setup, and model\nevaluation processes. This structure enables researchers to conduct fair\ncomparisons of various visual explanation techniques. In addition, we provide a\ncomprehensive review of over 10 evaluation methods for visual explanation to\nassist researchers in effectively utilizing our dataset collection. To further\nassess the performance of existing visual explanation methods, we conduct\nexperiments on selected datasets using various model-centered and ground\ntruth-centered evaluation metrics. We envision this benchmark could facilitate\nthe advancement of visual explanation models. The XAI dataset collection and\neasy-to-use code for evaluation are publicly accessible at\nhttps://xaidataset.github.io.\n","authors":["Yifei Zhang","Siyi Gu","James Song","Bo Pan","Liang Zhao"],"pdf_url":"https://arxiv.org/pdf/2310.08537v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05173v2","updated":"2023-10-12T17:25:44Z","published":"2023-09-11T00:02:05Z","title":"DePT: Decomposed Prompt Tuning for Parameter-Efficient Fine-tuning","summary":" Prompt tuning (PT), where a small amount of trainable soft (continuous)\nprompt vectors is affixed to the input of language models (LM), has shown\npromising results across various tasks and models for parameter-efficient\nfine-tuning (PEFT). PT stands out from other PEFT approaches because it\nmaintains competitive performance with fewer trainable parameters and does not\ndrastically scale up its parameters as the model size expands. However, PT\nintroduces additional soft prompt tokens, leading to longer input sequences,\nwhich significantly impacts training and inference time and memory usage due to\nthe Transformer's quadratic complexity. Particularly concerning for Large\nLanguage Models (LLMs) that face heavy daily querying. To address this issue,\nwe propose Decomposed Prompt Tuning (DePT), which decomposes the soft prompt\ninto a shorter soft prompt and a pair of low-rank matrices that are then\noptimised with two different learning rates. This allows DePT to achieve better\nperformance while saving over 20% memory and time costs compared to vanilla PT\nand its variants, without changing trainable parameter sizes. Through extensive\nexperiments on 23 natural language processing (NLP) and vision-language (VL)\ntasks, we demonstrate that DePT outperforms state-of-the-art PEFT approaches,\nincluding the full fine-tuning baseline in some scenarios. 
Additionally, we\nempirically show that DEPT grows more efficient as the model size increases.\nOur further study reveals that DePT integrates seamlessly with\nparameter-efficient transfer learning in the few-shot learning setting and\nhighlights its adaptability to various model architectures and sizes.\n","authors":["Zhengxiang Shi","Aldo Lipani"],"pdf_url":"https://arxiv.org/pdf/2309.05173v2.pdf","comment":"Code is available at https://github.com/ZhengxiangShi/DePT"},{"id":"http://arxiv.org/abs/2307.03293v2","updated":"2023-10-12T17:25:26Z","published":"2023-07-06T21:08:03Z","title":"CheXmask: a large-scale dataset of anatomical segmentation masks for\n multi-center chest x-ray images","summary":" The development of successful artificial intelligence models for chest X-ray\nanalysis relies on large, diverse datasets with high-quality annotations. While\nseveral databases of chest X-ray images have been released, most include\ndisease diagnosis labels but lack detailed pixel-level anatomical segmentation\nlabels. To address this gap, we introduce an extensive chest X-ray multi-center\nsegmentation dataset with uniform and fine-grain anatomical annotations for\nimages coming from six well-known publicly available databases: CANDID-PTX,\nChestX-ray8, Chexpert, MIMIC-CXR-JPG, Padchest, and VinDr-CXR, resulting in\n676,803 segmentation masks. Our methodology utilizes the HybridGNet model to\nensure consistent and high-quality segmentations across all datasets. Rigorous\nvalidation, including expert physician evaluation and automatic quality\ncontrol, was conducted to validate the resulting masks. Additionally, we\nprovide individualized quality indices per mask and an overall quality\nestimation per dataset. This dataset serves as a valuable resource for the\nbroader scientific community, streamlining the development and assessment of\ninnovative methodologies in chest X-ray analysis. The CheXmask dataset is\npublicly available at:\nhttps://physionet.org/content/chexmask-cxr-segmentation-data/\n","authors":["Nicolás Gaggion","Candelaria Mosquera","Lucas Mansilla","Martina Aineseder","Diego H. Milone","Enzo Ferrante"],"pdf_url":"https://arxiv.org/pdf/2307.03293v2.pdf","comment":"The CheXmask dataset is publicly available at\n https://physionet.org/content/chexmask-cxr-segmentation-data/"},{"id":"http://arxiv.org/abs/2310.08534v1","updated":"2023-10-12T17:24:05Z","published":"2023-10-12T17:24:05Z","title":"Animating Street View","summary":" We present a system that automatically brings street view imagery to life by\npopulating it with naturally behaving, animated pedestrians and vehicles. Our\napproach is to remove existing people and vehicles from the input image, insert\nmoving objects with proper scale, angle, motion, and appearance, plan paths and\ntraffic behavior, as well as render the scene with plausible occlusion and\nshadowing effects. The system achieves these by reconstructing the still image\nstreet scene, simulating crowd behavior, and rendering with consistent\nlighting, visibility, occlusions, and shadows. 
We demonstrate results on a\ndiverse range of street scenes including regular still images and panoramas.\n","authors":["Mengyi Shan","Brian Curless","Ira Kemelmacher-Shlizerman","Steve Seitz"],"pdf_url":"https://arxiv.org/pdf/2310.08534v1.pdf","comment":"SIGGRAPH Asia 2023 Conference Track"},{"id":"http://arxiv.org/abs/2310.08530v1","updated":"2023-10-12T17:22:58Z","published":"2023-10-12T17:22:58Z","title":"UniPose: Detecting Any Keypoints","summary":" This work proposes a unified framework called UniPose to detect keypoints of\nany articulated (e.g., human and animal), rigid, and soft objects via visual or\ntextual prompts for fine-grained vision understanding and manipulation.\nKeypoint is a structure-aware, pixel-level, and compact representation of any\nobject, especially articulated objects. Existing fine-grained promptable tasks\nmainly focus on object instance detection and segmentation but often fail to\nidentify fine-grained granularity and structured information of image and\ninstance, such as eyes, leg, paw, etc. Meanwhile, prompt-based keypoint\ndetection is still under-explored. To bridge the gap, we make the first attempt\nto develop an end-to-end prompt-based keypoint detection framework called\nUniPose to detect keypoints of any objects. As keypoint detection tasks are\nunified in this framework, we can leverage 13 keypoint detection datasets with\n338 keypoints across 1,237 categories over 400K instances to train a generic\nkeypoint detection model. UniPose can effectively align text-to-keypoint and\nimage-to-keypoint due to the mutual enhancement of textual and visual prompts\nbased on the cross-modality contrastive learning optimization objectives. Our\nexperimental results show that UniPose has strong fine-grained localization and\ngeneralization abilities across image styles, categories, and poses. Based on\nUniPose as a generalist keypoint detector, we hope it could serve fine-grained\nvisual perception, understanding, and generation.\n","authors":["Jie Yang","Ailing Zeng","Ruimao Zhang","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.08530v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08529v1","updated":"2023-10-12T17:22:24Z","published":"2023-10-12T17:22:24Z","title":"GaussianDreamer: Fast Generation from Text to 3D Gaussian Splatting with\n Point Cloud Priors","summary":" In recent times, the generation of 3D assets from text prompts has shown\nimpressive results. Both 2D and 3D diffusion models can generate decent 3D\nobjects based on prompts. 3D diffusion models have good 3D consistency, but\ntheir quality and generalization are limited as trainable 3D data is expensive\nand hard to obtain. 2D diffusion models enjoy strong abilities of\ngeneralization and fine generation, but the 3D consistency is hard to\nguarantee. This paper attempts to bridge the power from the two types of\ndiffusion models via the recent explicit and efficient 3D Gaussian splatting\nrepresentation. A fast 3D generation framework, named as \\name, is proposed,\nwhere the 3D diffusion model provides point cloud priors for initialization and\nthe 2D diffusion model enriches the geometry and appearance. Operations of\nnoisy point growing and color perturbation are introduced to enhance the\ninitialized Gaussians. Our \\name can generate a high-quality 3D instance within\n25 minutes on one GPU, much faster than previous methods, while the generated\ninstances can be directly rendered in real time. 
Demos and code are available\nat https://taoranyi.com/gaussiandreamer/.\n","authors":["Taoran Yi","Jiemin Fang","Guanjun Wu","Lingxi Xie","Xiaopeng Zhang","Wenyu Liu","Qi Tian","Xinggang Wang"],"pdf_url":"https://arxiv.org/pdf/2310.08529v1.pdf","comment":"Work in progress. Project page: https://taoranyi.com/gaussiandreamer/"},{"id":"http://arxiv.org/abs/2310.08528v1","updated":"2023-10-12T17:21:41Z","published":"2023-10-12T17:21:41Z","title":"4D Gaussian Splatting for Real-Time Dynamic Scene Rendering","summary":" Representing and rendering dynamic scenes has been an important but\nchallenging task. Especially, to accurately model complex motions, high\nefficiency is usually hard to maintain. We introduce the 4D Gaussian Splatting\n(4D-GS) to achieve real-time dynamic scene rendering while also enjoying high\ntraining and storage efficiency. An efficient deformation field is constructed\nto model both Gaussian motions and shape deformations. Different adjacent\nGaussians are connected via a HexPlane to produce more accurate position and\nshape deformations. Our 4D-GS method achieves real-time rendering under high\nresolutions, 70 FPS at a 800$\\times$800 resolution on an RTX 3090 GPU, while\nmaintaining comparable or higher quality than previous state-of-the-art\nmethods. More demos and code are available at\nhttps://guanjunwu.github.io/4dgs/.\n","authors":["Guanjun Wu","Taoran Yi","Jiemin Fang","Lingxi Xie","Xiaopeng Zhang","Wei Wei","Wenyu Liu","Qi Tian","Xinggang Wang"],"pdf_url":"https://arxiv.org/pdf/2310.08528v1.pdf","comment":"Work in progress. Project page: https://guanjunwu.github.io/4dgs/"},{"id":"http://arxiv.org/abs/2310.08501v1","updated":"2023-10-12T16:59:50Z","published":"2023-10-12T16:59:50Z","title":"Unsupervised Learning of Object-Centric Embeddings for Cell Instance\n Segmentation in Microscopy Images","summary":" Segmentation of objects in microscopy images is required for many biomedical\napplications. We introduce object-centric embeddings (OCEs), which embed image\npatches such that the spatial offsets between patches cropped from the same\nobject are preserved. Those learnt embeddings can be used to delineate\nindividual objects and thus obtain instance segmentations. Here, we show\ntheoretically that, under assumptions commonly found in microscopy images, OCEs\ncan be learnt through a self-supervised task that predicts the spatial offset\nbetween image patches. Together, this forms an unsupervised cell instance\nsegmentation method which we evaluate on nine diverse large-scale microscopy\ndatasets. Segmentations obtained with our method lead to substantially improved\nresults, compared to state-of-the-art baselines on six out of nine datasets,\nand perform on par on the remaining three datasets. If ground-truth annotations\nare available, our method serves as an excellent starting point for supervised\ntraining, reducing the required amount of ground-truth needed by one order of\nmagnitude, thus substantially increasing the practical applicability of our\nmethod. 
Source code is available at https://github.com/funkelab/cellulus.\n","authors":["Steffen Wolf","Manan Lalit","Henry Westmacott","Katie McDole","Jan Funke"],"pdf_url":"https://arxiv.org/pdf/2310.08501v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.06823v2","updated":"2023-10-12T16:42:55Z","published":"2023-10-10T17:53:36Z","title":"NECO: NEural Collapse Based Out-of-distribution detection","summary":" Detecting out-of-distribution (OOD) data is a critical challenge in machine\nlearning due to model overconfidence, often without awareness of their\nepistemological limits. We hypothesize that ``neural collapse'', a phenomenon\naffecting in-distribution data for models trained beyond loss convergence, also\ninfluences OOD data. To benefit from this interplay, we introduce NECO, a novel\npost-hoc method for OOD detection, which leverages the geometric properties of\n``neural collapse'' and of principal component spaces to identify OOD data. Our\nextensive experiments demonstrate that NECO achieves state-of-the-art results\non both small and large-scale OOD detection tasks while exhibiting strong\ngeneralization capabilities across different network architectures.\nFurthermore, we provide a theoretical explanation for the effectiveness of our\nmethod in OOD detection. We plan to release the code after the anonymity\nperiod.\n","authors":["Mouïn Ben Ammar","Nacim Belkhir","Sebastian Popescu","Antoine Manzanera","Gianni Franchi"],"pdf_url":"https://arxiv.org/pdf/2310.06823v2.pdf","comment":"28 pages"},{"id":"http://arxiv.org/abs/2310.08475v1","updated":"2023-10-12T16:32:44Z","published":"2023-10-12T16:32:44Z","title":"Can We Edit Multimodal Large Language Models?","summary":" In this paper, we focus on editing Multimodal Large Language Models (MLLMs).\nCompared to editing single-modal LLMs, multimodal model editing is more\nchallenging, which demands a higher level of scrutiny and careful consideration\nin the editing process. To facilitate research in this area, we construct a new\nbenchmark, dubbed MMEdit, for editing multimodal LLMs and establishing a suite\nof innovative metrics for evaluation. We conduct comprehensive experiments\ninvolving various model editing baselines and analyze the impact of editing\ndifferent components for multimodal LLMs. Empirically, we notice that previous\nbaselines can implement editing multimodal LLMs to some extent, but the effect\nis still barely satisfactory, indicating the potential difficulty of this task.\nWe hope that our work can provide the NLP community with insights\\footnote{Code\nand dataset are available in https://github.com/zjunlp/EasyEdit.\n","authors":["Siyuan Cheng","Bozhong Tian","Qingbin Liu","Xi Chen","Yongheng Wang","Huajun Chen","Ningyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.08475v1.pdf","comment":"EMNLP 2023"},{"id":"http://arxiv.org/abs/2310.08465v1","updated":"2023-10-12T16:26:18Z","published":"2023-10-12T16:26:18Z","title":"MotionDirector: Motion Customization of Text-to-Video Diffusion Models","summary":" Large-scale pre-trained diffusion models have exhibited remarkable\ncapabilities in diverse video generations. Given a set of video clips of the\nsame motion concept, the task of Motion Customization is to adapt existing\ntext-to-video diffusion models to generate videos with this motion. For\nexample, generating a video with a car moving in a prescribed manner under\nspecific camera movements to make a movie, or a video illustrating how a bear\nwould lift weights to inspire creators. 
Adaptation methods have been developed\nfor customizing appearance like subject or style, yet unexplored for motion. It\nis straightforward to extend mainstream adaption methods for motion\ncustomization, including full model tuning, parameter-efficient tuning of\nadditional layers, and Low-Rank Adaptions (LoRAs). However, the motion concept\nlearned by these methods is often coupled with the limited appearances in the\ntraining videos, making it difficult to generalize the customized motion to\nother appearances. To overcome this challenge, we propose MotionDirector, with\na dual-path LoRAs architecture to decouple the learning of appearance and\nmotion. Further, we design a novel appearance-debiased temporal loss to\nmitigate the influence of appearance on the temporal training objective.\nExperimental results show the proposed method can generate videos of diverse\nappearances for the customized motions. Our method also supports various\ndownstream applications, such as the mixing of different videos with their\nappearance and motion respectively, and animating a single image with\ncustomized motions. Our code and model weights will be released.\n","authors":["Rui Zhao","Yuchao Gu","Jay Zhangjie Wu","David Junhao Zhang","Jiawei Liu","Weijia Wu","Jussi Keppo","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2310.08465v1.pdf","comment":"Project Page: https://showlab.github.io/MotionDirector/"},{"id":"http://arxiv.org/abs/2310.08451v1","updated":"2023-10-12T16:11:13Z","published":"2023-10-12T16:11:13Z","title":"Proving the Potential of Skeleton Based Action Recognition to Automate\n the Analysis of Manual Processes","summary":" In manufacturing sectors such as textiles and electronics, manual processes\nare a fundamental part of production. The analysis and monitoring of the\nprocesses is necessary for efficient production design. Traditional methods for\nanalyzing manual processes are complex, expensive, and inflexible. Compared to\nestablished approaches such as Methods-Time-Measurement (MTM), machine learning\n(ML) methods promise: Higher flexibility, self-sufficient & permanent use,\nlower costs. In this work, based on a video stream, the current motion class in\na manual assembly process is detected. With information on the current motion,\nKey-Performance-Indicators (KPIs) can be derived easily. A skeleton-based\naction recognition approach is taken, as this field recently shows major\nsuccess in machine vision tasks. For skeleton-based action recognition in\nmanual assembly, no sufficient pre-work could be found. Therefore, a ML\npipeline is developed, to enable extensive research on different (pre-)\nprocessing methods and neural nets. Suitable well generalizing approaches are\nfound, proving the potential of ML to enhance analyzation of manual processes.\nModels detect the current motion, performed by an operator in manual assembly,\nbut the results can be transferred to all kinds of manual processes.\n","authors":["Marlin Berger","Frederik Cloppenburg","Jens Eufinger","Thomas Gries"],"pdf_url":"https://arxiv.org/pdf/2310.08451v1.pdf","comment":"16 pages, 6 figures. Find peer-reviewed version in Proceedings of\n IntelliSys 2023"},{"id":"http://arxiv.org/abs/2310.08442v1","updated":"2023-10-12T16:04:41Z","published":"2023-10-12T16:04:41Z","title":"Debias the Training of Diffusion Models","summary":" Diffusion models have demonstrated compelling generation quality by\noptimizing the variational lower bound through a simple denoising score\nmatching loss. 
In this paper, we provide theoretical evidence that the\nprevailing practice of using a constant loss weight strategy in diffusion\nmodels leads to biased estimation during the training phase. Simply optimizing\nthe denoising network to predict Gaussian noise with constant weighting may\nhinder precise estimations of original images. To address the issue, we propose\nan elegant and effective weighting strategy grounded in the theoretically\nunbiased principle. Moreover, we conduct a comprehensive and systematic\nexploration to dissect the inherent bias problem deriving from constant\nweighting loss from the perspectives of its existence, impact and reasons.\nThese analyses are expected to advance our understanding and demystify the\ninner workings of diffusion models. Through empirical evaluation, we\ndemonstrate that our proposed debiased estimation method significantly enhances\nsample quality without the reliance on complex techniques, and exhibits\nimproved efficiency compared to the baseline method both in training and\nsampling processes.\n","authors":["Hu Yu","Li Shen","Jie Huang","Man Zhou","Hongsheng Li","Feng Zhao"],"pdf_url":"https://arxiv.org/pdf/2310.08442v1.pdf","comment":"University of Science and Technology of China, Alibaba Group, The\n Chinese University of Hong Kong"},{"id":"http://arxiv.org/abs/2310.08430v1","updated":"2023-10-12T15:53:47Z","published":"2023-10-12T15:53:47Z","title":"Assessing of Soil Erosion Risk Through Geoinformation Sciences and\n Remote Sensing -- A Review","summary":" During past decades a marked manifestation of widespread erosion phenomena\nwas studied worldwide. Global conservation community has launched campaigns at\nlocal, regional and continental level in developing countries for preservation\nof soil resources in order not only to stop or mitigate human impact on nature\nbut also to improve life in rural areas introducing new approaches for soil\ncultivation. After the adoption of Sustainable Development Goals of UNs and\nlaunching several world initiatives such as the Land Degradation Neutrality\n(LDN) the world came to realize the very importance of the soil resources on\nwhich the biosphere relies for its existence. The main goal of the chapter is\nto review different types and structures erosion models as well as their\napplications. Several methods using spatial analysis capabilities of geographic\ninformation systems (GIS) are in operation for soil erosion risk assessment,\nsuch as Universal Soil Loss Equation (USLE), Revised Universal Soil Loss\nEquation (RUSLE) in operation worldwide and in the USA and MESALES model. These\nand more models are being discussed in the present work alongside more\nexperimental models and methods for assessing soil erosion risk such as\nArtificial Intelligence (AI), Machine and Deep Learning, etc. At the end of\nthis work, a prospectus for the future development of soil erosion risk\nassessment is drawn.\n","authors":["Lachezar Filchev","Vasil Kolev"],"pdf_url":"https://arxiv.org/pdf/2310.08430v1.pdf","comment":"Chapter 21 (pages 54)"},{"id":"http://arxiv.org/abs/2310.08429v1","updated":"2023-10-12T15:53:24Z","published":"2023-10-12T15:53:24Z","title":"Revisiting Data Augmentation for Rotational Invariance in Convolutional\n Neural Networks","summary":" Convolutional Neural Networks (CNN) offer state of the art performance in\nvarious computer vision tasks. Many of those tasks require different subtypes\nof affine invariances (scale, rotational, translational) to image\ntransformations. 
Convolutional layers are translation equivariant by design,\nbut in their basic form lack invariances. In this work we investigate how best\nto include rotational invariance in a CNN for image classification. Our\nexperiments show that networks trained with data augmentation alone can\nclassify rotated images nearly as well as in the normal unrotated case; this\nincrease in representational power comes only at the cost of training time. We\nalso compare data augmentation versus two modified CNN models for achieving\nrotational invariance or equivariance, Spatial Transformer Networks and Group\nEquivariant CNNs, finding no significant accuracy increase with these\nspecialized methods. In the case of data augmented networks, we also analyze\nwhich layers help the network to encode the rotational invariance, which is\nimportant for understanding its limitations and how to best retrain a network\nwith data augmentation to achieve invariance to rotation.\n","authors":["Facundo Manuel Quiroga","Franco Ronchetti","Laura Lanzarini","Aurelio Fernandez-Bariviera"],"pdf_url":"https://arxiv.org/pdf/2310.08429v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.05188v3","updated":"2023-10-12T15:43:01Z","published":"2022-05-10T21:55:26Z","title":"On Scale Space Radon Transform, Properties and Application in CT Image\n Reconstruction","summary":" Since the Radon transform (RT) consists in a line integral function, some\nmodeling assumptions are made on Computed Tomography (CT) system, making image\nreconstruction analytical methods, such as Filtered Backprojection (FBP),\nsensitive to artifacts and noise. In the other hand, recently, a new integral\ntransform, called Scale Space Radon Transform (SSRT), is introduced where, RT\nis a particular case. Thanks to its interesting properties, such as good scale\nspace behavior, the SSRT has known number of new applications. In this paper,\nwith the aim to improve the reconstructed image quality for these methods, we\npropose to model the X-ray beam with the Scale Space Radon Transform (SSRT)\nwhere, the assumptions done on the physical dimensions of the CT system\nelements reflect better the reality. After depicting the basic properties and\nthe inversion of SSRT, the FBP algorithm is used to reconstruct the image from\nthe SSRT sinogram where the RT spectrum used in FBP is replaced by SSRT and the\nGaussian kernel, expressed in their frequency domain. PSNR and SSIM, as quality\nmeasures, are used to compare RT and SSRT-based image reconstruction on\nShepp-Logan head and anthropomorphic abdominal phantoms. The first findings\nshow that the SSRT-based method outperforms the methods based on RT,\nespecially, when the number of projections is reduced, making it more\nappropriate for applications requiring low-dose radiation, such as medical\nX-ray CT. 
While SSRT-FBP and RT-FBP have utmost the same runtime, the\nexperiments show that SSRT-FBP is more robust to Poisson-Gaussian noise\ncorrupting CT data.\n","authors":["Nafaa Nacereddine","Djemel Ziou","Aicha Baya Goumeidane"],"pdf_url":"https://arxiv.org/pdf/2205.05188v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08421v1","updated":"2023-10-12T15:42:17Z","published":"2023-10-12T15:42:17Z","title":"\"SegLoc\": Study on Novel Visual Self-supervised Learning Scheme (Segment\n Localization) Tailored for Dense Prediction Tasks of Security Inspection\n X-ray Images","summary":" Lately, remarkable advancements of artificial intelligence have been\nattributed to the integration of self-supervised learning scheme. Despite\nimpressive achievements within NLP, yet SSL in computer vision has not been\nable to stay on track comparatively. Recently, integration of contrastive\nlearning on top of existing SSL models has established considerable progress in\ncomputer vision through which visual SSL models have outperformed their\nsupervised counterparts. Nevertheless, most of these improvements were limited\nto classification tasks, and also, few works have been dedicated to evaluation\nof SSL models in real-world scenarios of computer vision, while the majority of\nworks are centered around datasets containing class-wise portrait images, most\nnotably, ImageNet. Consequently, in this work, we have considered dense\nprediction task of semantic segmentation in security inspection x-ray images to\nevaluate our proposed model Segmentation Localization. Based upon the model\nInstance Localization, our model SegLoc has managed to address one of the most\nchallenging downsides of contrastive learning, i.e., false negative pairs of\nquery embeddings. In order to do so, in contrast to baseline model InsLoc, our\npretraining dataset is synthesized by cropping, transforming, then pasting\nalready labeled segments from an available labeled dataset, foregrounds, onto\ninstances of an unlabeled dataset, backgrounds. In our case, PIDray and SIXray\ndatasets are considered as labeled and unlabeled datasets, respectively.\nMoreover, we fully harness labels by avoiding false negative pairs through\nimplementing the idea, one queue per class, in MoCo-v2 whereby negative pairs\ncorresponding to each query are extracted from its corresponding queue within\nthe memory bank. Our approach has outperformed random initialization by 3% to\n6%, while having underperformed supervised initialization.\n","authors":["Shervin Halat","Mohammad Rahmati","Ehsan Nazerfard"],"pdf_url":"https://arxiv.org/pdf/2310.08421v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08420v1","updated":"2023-10-12T15:39:54Z","published":"2023-10-12T15:39:54Z","title":"Visual Attention-Prompted Prediction and Learning","summary":" Explanation(attention)-guided learning is a method that enhances a model's\npredictive power by incorporating human understanding during the training\nphase. While attention-guided learning has shown promising results, it often\ninvolves time-consuming and computationally expensive model retraining. To\naddress this issue, we introduce the attention-prompted prediction technique,\nwhich enables direct prediction guided by the attention prompt without the need\nfor model retraining. However, this approach presents several challenges,\nincluding: 1) How to incorporate the visual attention prompt into the model's\ndecision-making process and leverage it for future predictions even in the\nabsence of a prompt? 
and 2) How to handle the incomplete information from the\nvisual attention prompt? To tackle these challenges, we propose a novel\nframework called Visual Attention-Prompted Prediction and Learning, which\nseamlessly integrates visual attention prompts into the model's decision-making\nprocess and adapts to images both with and without attention prompts for\nprediction. To address the incomplete information of the visual attention\nprompt, we introduce a perturbation-based attention map modification method.\nAdditionally, we propose an optimization-based mask aggregation method with a\nnew weight learning function for adaptive perturbed annotation aggregation in\nthe attention map modification process. Our overall framework is designed to\nlearn in an attention-prompt guided multi-task manner to enhance future\npredictions even for samples without attention prompts and trained in an\nalternating manner for better convergence. Extensive experiments conducted on\ntwo datasets demonstrate the effectiveness of our proposed framework in\nenhancing predictions for samples, both with and without provided prompts.\n","authors":["Yifei Zhang","Siyi Gu","Bo Pan","Guangji Bai","Xiaofeng Yang","Liang Zhao"],"pdf_url":"https://arxiv.org/pdf/2310.08420v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.11713v4","updated":"2023-10-12T15:30:41Z","published":"2023-02-23T00:33:54Z","title":"Can Pre-trained Vision and Language Models Answer Visual\n Information-Seeking Questions?","summary":" Pre-trained vision and language models have demonstrated state-of-the-art\ncapabilities over existing tasks involving images and texts, including visual\nquestion answering. However, it remains unclear whether these models possess\nthe capability to answer questions that are not only querying visual content\nbut knowledge-intensive and information-seeking. In this study, we introduce\nInfoSeek, a visual question answering dataset tailored for information-seeking\nquestions that cannot be answered with only common sense knowledge. Using\nInfoSeek, we analyze various pre-trained visual question answering models and\ngain insights into their characteristics. Our findings reveal that\nstate-of-the-art pre-trained multi-modal models (e.g., PaLI-X, BLIP2, etc.)\nface challenges in answering visual information-seeking questions, but\nfine-tuning on the InfoSeek dataset elicits models to use fine-grained\nknowledge that was learned during their pre-training. 
Furthermore, we show that\naccurate visual entity recognition can be used to improve performance on\nInfoSeek by retrieving relevant documents, showing a significant space for\nimprovement.\n","authors":["Yang Chen","Hexiang Hu","Yi Luan","Haitian Sun","Soravit Changpinyo","Alan Ritter","Ming-Wei Chang"],"pdf_url":"https://arxiv.org/pdf/2302.11713v4.pdf","comment":"EMNLP 2023 (main conference); Our dataset and evaluation is available\n at https://open-vision-language.github.io/infoseek/"},{"id":"http://arxiv.org/abs/2310.08398v1","updated":"2023-10-12T15:09:12Z","published":"2023-10-12T15:09:12Z","title":"Towards Design and Development of an ArUco Markers-Based Quantitative\n Surface Tactile Sensor","summary":" In this paper, with the goal of quantifying the qualitative image outputs of\na Vision-based Tactile Sensor (VTS), we present the design, fabrication, and\ncharacterization of a novel Quantitative Surface Tactile Sensor (called QS-TS).\nQS-TS directly estimates the sensor's gel layer deformation in real-time\nenabling safe and autonomous tactile manipulation and servoing of delicate\nobjects using robotic manipulators. The core of the proposed sensor is the\nutilization of miniature 1.5 mm x 1.5 mm synthetic square markers with inner\nbinary patterns and a broad black border, called ArUco Markers. Each ArUco\nmarker can provide real-time camera pose estimation that, in our design, is\nused as a quantitative measure for obtaining deformation of the QS-TS gel\nlayer. Moreover, thanks to the use of ArUco markers, we propose a unique\nfabrication procedure that mitigates various challenges associated with the\nfabrication of the existing marker-based VTSs and offers an intuitive and\nless-arduous method for the construction of the VTS. Remarkably, the proposed\nfabrication facilitates the integration and adherence of markers with the gel\nlayer to robustly and reliably obtain a quantitative measure of deformation in\nreal-time regardless of the orientation of ArUco Markers. The performance and\nefficacy of the proposed QS-TS in estimating the deformation of the sensor's\ngel layer were experimentally evaluated and verified. Results demonstrate the\nphenomenal performance of the QS-TS in estimating the deformation of the gel\nlayer with a relative error of <5%.\n","authors":["Ozdemir Can Kara","Charles Everson","Farshid Alambeigi"],"pdf_url":"https://arxiv.org/pdf/2310.08398v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.02621v2","updated":"2023-10-12T15:05:15Z","published":"2023-04-05T17:43:57Z","title":"High-fidelity Pseudo-labels for Boosting Weakly-Supervised Segmentation","summary":" Image-level weakly-supervised semantic segmentation (WSSS) reduces the\nusually vast data annotation cost by surrogate segmentation masks during\ntraining. The typical approach involves training an image classification\nnetwork using global average pooling (GAP) on convolutional feature maps. This\nenables the estimation of object locations based on class activation maps\n(CAMs), which identify the importance of image regions. The CAMs are then used\nto generate pseudo-labels, in the form of segmentation masks, to supervise a\nsegmentation model in the absence of pixel-level ground truth. Our work is\nbased on two techniques for improving CAMs; importance sampling, which is a\nsubstitute for GAP, and the feature similarity loss, which utilizes a heuristic\nthat object contours almost always align with color edges in images. 
However,\nboth are based on the multinomial posterior with softmax, and implicitly assume\nthat classes are mutually exclusive, which turns out suboptimal in our\nexperiments. Thus, we reformulate both techniques based on binomial posteriors\nof multiple independent binary problems. This has two benefits; their\nperformance is improved and they become more general, resulting in an add-on\nmethod that can boost virtually any WSSS method. This is demonstrated on a wide\nvariety of baselines on the PASCAL VOC dataset, improving the region similarity\nand contour quality of all implemented state-of-the-art methods. Experiments on\nthe MS COCO dataset show that our proposed add-on is well-suited for\nlarge-scale settings. Our code is available at https://github.com/arvijj/hfpl.\n","authors":["Arvi Jonnarth","Yushan Zhang","Michael Felsberg"],"pdf_url":"https://arxiv.org/pdf/2304.02621v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04946v2","updated":"2023-10-12T15:04:30Z","published":"2023-09-10T06:33:17Z","title":"Efficient Emotional Adaptation for Audio-Driven Talking-Head Generation","summary":" Audio-driven talking-head synthesis is a popular research topic for virtual\nhuman-related applications. However, the inflexibility and inefficiency of\nexisting methods, which necessitate expensive end-to-end training to transfer\nemotions from guidance videos to talking-head predictions, are significant\nlimitations. In this work, we propose the Emotional Adaptation for Audio-driven\nTalking-head (EAT) method, which transforms emotion-agnostic talking-head\nmodels into emotion-controllable ones in a cost-effective and efficient manner\nthrough parameter-efficient adaptations. Our approach utilizes a pretrained\nemotion-agnostic talking-head transformer and introduces three lightweight\nadaptations (the Deep Emotional Prompts, Emotional Deformation Network, and\nEmotional Adaptation Module) from different perspectives to enable precise and\nrealistic emotion controls. Our experiments demonstrate that our approach\nachieves state-of-the-art performance on widely-used benchmarks, including LRW\nand MEAD. Additionally, our parameter-efficient adaptations exhibit remarkable\ngeneralization ability, even in scenarios where emotional training videos are\nscarce or nonexistent. Project website: https://yuangan.github.io/eat/\n","authors":["Yuan Gan","Zongxin Yang","Xihang Yue","Lingyun Sun","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2309.04946v2.pdf","comment":"Accepted to ICCV 2023. Project page: https://yuangan.github.io/eat/"},{"id":"http://arxiv.org/abs/2310.08390v1","updated":"2023-10-12T15:00:06Z","published":"2023-10-12T15:00:06Z","title":"Hyp-UML: Hyperbolic Image Retrieval with Uncertainty-aware Metric\n Learning","summary":" Metric learning plays a critical role in training image retrieval and\nclassification. It is also a key algorithm in representation learning, e.g.,\nfor feature learning and its alignment in metric space. Hyperbolic embedding\nhas been recently developed, compared to the conventional Euclidean embedding\nin most of the previously developed models, and can be more effective in\nrepresenting the hierarchical data structure. Second, uncertainty\nestimation/measurement is a long-lasting challenge in artificial intelligence.\nSuccessful uncertainty estimation can improve a machine learning model's\nperformance, robustness, and security. In Hyperbolic space, uncertainty\nmeasurement is at least with equivalent, if not more, critical importance. 
In\nthis paper, we develop a Hyperbolic image embedding with uncertainty-aware\nmetric learning for image retrieval. We call our method Hyp-UML: Hyperbolic\nUncertainty-aware Metric Learning. Our contribution are threefold: we propose\nan image embedding algorithm based on Hyperbolic space, with their\ncorresponding uncertainty value; we propose two types of uncertainty-aware\nmetric learning, for the popular Contrastive learning and conventional\nmargin-based metric learning, respectively. We perform extensive experimental\nvalidations to prove that the proposed algorithm can achieve state-of-the-art\nresults among related methods. The comprehensive ablation study validates the\neffectiveness of each component of the proposed algorithm.\n","authors":["Shiyang Yan","Zongxuan Liu","Lin Xu"],"pdf_url":"https://arxiv.org/pdf/2310.08390v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08387v1","updated":"2023-10-12T14:59:22Z","published":"2023-10-12T14:59:22Z","title":"MeanAP-Guided Reinforced Active Learning for Object Detection","summary":" Active learning presents a promising avenue for training high-performance\nmodels with minimal labeled data, achieved by judiciously selecting the most\ninformative instances to label and incorporating them into the task learner.\nDespite notable advancements in active learning for image recognition, metrics\ndevised or learned to gauge the information gain of data, crucial for query\nstrategy design, do not consistently align with task model performance metrics,\nsuch as Mean Average Precision (MeanAP) in object detection tasks. This paper\nintroduces MeanAP-Guided Reinforced Active Learning for Object Detection\n(MAGRAL), a novel approach that directly utilizes the MeanAP metric of the task\nmodel to devise a sampling strategy employing a reinforcement learning-based\nsampling agent. Built upon LSTM architecture, the agent efficiently explores\nand selects subsequent training instances, and optimizes the process through\npolicy gradient with MeanAP serving as reward. Recognizing the time-intensive\nnature of MeanAP computation at each step, we propose fast look-up tables to\nexpedite agent training. We assess MAGRAL's efficacy across popular benchmarks,\nPASCAL VOC and MS COCO, utilizing different backbone architectures. Empirical\nfindings substantiate MAGRAL's superiority over recent state-of-the-art\nmethods, showcasing substantial performance gains. MAGRAL establishes a robust\nbaseline for reinforced active object detection, signifying its potential in\nadvancing the field.\n","authors":["Zhixuan Liang","Xingyu Zeng","Rui Zhao","Ping Luo"],"pdf_url":"https://arxiv.org/pdf/2310.08387v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08381v1","updated":"2023-10-12T14:55:31Z","published":"2023-10-12T14:55:31Z","title":"AutoVP: An Automated Visual Prompting Framework and Benchmark","summary":" Visual prompting (VP) is an emerging parameter-efficient fine-tuning approach\nto adapting pre-trained vision models to solve various downstream\nimage-classification tasks. However, there has hitherto been little systematic\nstudy of the design space of VP and no clear benchmark for evaluating its\nperformance. To bridge this gap, we propose AutoVP, an end-to-end expandable\nframework for automating VP design choices, along with 12 downstream\nimage-classification tasks that can serve as a holistic VP-performance\nbenchmark. 
Our design space covers 1) the joint optimization of the prompts; 2)\nthe selection of pre-trained models, including image classifiers and text-image\nencoders; and 3) model output mapping strategies, including nonparametric and\ntrainable label mapping. Our extensive experimental results show that AutoVP\noutperforms the best-known current VP methods by a substantial margin, having\nup to 6.7% improvement in accuracy; and attains a maximum performance increase\nof 27.5% compared to linear-probing (LP) baseline. AutoVP thus makes a two-fold\ncontribution: serving both as an efficient tool for hyperparameter tuning on VP\ndesign choices, and as a comprehensive benchmark that can reasonably be\nexpected to accelerate VP's development. The source code is available at\nhttps://github.com/IBM/AutoVP.\n","authors":["Hsi-Ai Tsao","Lei Hsiung","Pin-Yu Chen","Sijia Liu","Tsung-Yi Ho"],"pdf_url":"https://arxiv.org/pdf/2310.08381v1.pdf","comment":"Preprint. The code is available at https://github.com/IBM/AutoVP"},{"id":"http://arxiv.org/abs/2310.08371v1","updated":"2023-10-12T14:40:24Z","published":"2023-10-12T14:40:24Z","title":"Worst-Case Morphs using Wasserstein ALI and Improved MIPGAN","summary":" A lot of progress has been made in the last years on using Generative\nAdversarial Networks (GAN) to create realistic images. However, to be able\nreconstruct images or to generate images using real data as input, an Encoder\nis needed that reverses the mapping from the GAN's latent space to image space.\nThis means that three networks are needed: an Encoder, a Decoder (called\nGenerator in a normal GAN) and a Discriminator. These three networks can be\ntrained from scratch simultaneously (Adversarially Learned Inference), or\nalternatively an Encoder network can be trained that maps images into the\nlatent space of a \\textit{pretrained} GAN model (Inverse GAN). In the latter\ncase, the networks are trained consecutively, so the Encoder has to make do\nwith whatever model the Decoder learned during GAN training. Training three\nnetworks simultaneously is more unstable and therefore more challenging, but it\nis possible that the Encoder and Decoder benefit from interacting with each\nother during training. We compare the two different approaches and discuss\nwhether it is worth the extra effort to train all three networks\nsimultaneously.\n","authors":["Una M. Kelly","Meike Nauta","Lu Liu","Luuk J. Spreeuwers","Raymond N. J. Veldhuis"],"pdf_url":"https://arxiv.org/pdf/2310.08371v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08370v1","updated":"2023-10-12T14:39:58Z","published":"2023-10-12T14:39:58Z","title":"UniPAD: A Universal Pre-training Paradigm for Autonomous Driving","summary":" In the context of autonomous driving, the significance of effective feature\nlearning is widely acknowledged. While conventional 3D self-supervised\npre-training methods have shown widespread success, most methods follow the\nideas originally designed for 2D images. In this paper, we present UniPAD, a\nnovel self-supervised learning paradigm applying 3D volumetric differentiable\nrendering. UniPAD implicitly encodes 3D space, facilitating the reconstruction\nof continuous 3D shape structures and the intricate appearance characteristics\nof their 2D projections. The flexibility of our method enables seamless\nintegration into both 2D and 3D frameworks, enabling a more holistic\ncomprehension of the scenes. 
We manifest the feasibility and effectiveness of\nUniPAD by conducting extensive experiments on various downstream 3D tasks. Our\nmethod significantly improves lidar-, camera-, and lidar-camera-based baseline\nby 9.1, 7.7, and 6.9 NDS, respectively. Notably, our pre-training pipeline\nachieves 73.2 NDS for 3D object detection and 79.4 mIoU for 3D semantic\nsegmentation on the nuScenes validation set, achieving state-of-the-art results\nin comparison with previous methods. The code will be available at\nhttps://github.com/Nightmare-n/UniPAD.\n","authors":["Honghui Yang","Sha Zhang","Di Huang","Xiaoyang Wu","Haoyi Zhu","Tong He","Shixiang Tang","Hengshuang Zhao","Qibo Qiu","Binbin Lin","Xiaofei He","Wanli Ouyang"],"pdf_url":"https://arxiv.org/pdf/2310.08370v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08368v1","updated":"2023-10-12T14:38:52Z","published":"2023-10-12T14:38:52Z","title":"Mapping Memes to Words for Multimodal Hateful Meme Classification","summary":" Multimodal image-text memes are prevalent on the internet, serving as a\nunique form of communication that combines visual and textual elements to\nconvey humor, ideas, or emotions. However, some memes take a malicious turn,\npromoting hateful content and perpetuating discrimination. Detecting hateful\nmemes within this multimodal context is a challenging task that requires\nunderstanding the intertwined meaning of text and images. In this work, we\naddress this issue by proposing a novel approach named ISSUES for multimodal\nhateful meme classification. ISSUES leverages a pre-trained CLIP\nvision-language model and the textual inversion technique to effectively\ncapture the multimodal semantic content of the memes. The experiments show that\nour method achieves state-of-the-art results on the Hateful Memes Challenge and\nHarMeme datasets. The code and the pre-trained models are publicly available at\nhttps://github.com/miccunifi/ISSUES.\n","authors":["Giovanni Burbi","Alberto Baldrati","Lorenzo Agnolucci","Marco Bertini","Alberto Del Bimbo"],"pdf_url":"https://arxiv.org/pdf/2310.08368v1.pdf","comment":"ICCV2023 CLVL Workshop"},{"id":"http://arxiv.org/abs/2310.08367v1","updated":"2023-10-12T14:38:25Z","published":"2023-10-12T14:38:25Z","title":"MCU: A Task-centric Framework for Open-ended Agent Evaluation in\n Minecraft","summary":" To pursue the goal of creating an open-ended agent in Minecraft, an\nopen-ended game environment with unlimited possibilities, this paper introduces\na task-centric framework named MCU for Minecraft agent evaluation. The MCU\nframework leverages the concept of atom tasks as fundamental building blocks,\nenabling the generation of diverse or even arbitrary tasks. Within the MCU\nframework, each task is measured with six distinct difficulty scores (time\nconsumption, operational effort, planning complexity, intricacy, creativity,\nnovelty). These scores offer a multi-dimensional assessment of a task from\ndifferent angles, and thus can reveal an agent's capability on specific facets.\nThe difficulty scores also serve as the feature of each task, which creates a\nmeaningful task space and unveils the relationship between tasks. For efficient\nevaluation of Minecraft agents employing the MCU framework, we maintain a\nunified benchmark, namely SkillForge, which comprises representative tasks with\ndiverse categories and difficulty distribution. We also provide convenient\nfilters for users to select tasks to assess specific capabilities of agents. 
We\nshow that MCU has the high expressivity to cover all tasks used in recent\nliterature on Minecraft agent, and underscores the need for advancements in\nareas such as creativity, precise control, and out-of-distribution\ngeneralization under the goal of open-ended Minecraft agent development.\n","authors":["Haowei Lin","Zihao Wang","Jianzhu Ma","Yitao Liang"],"pdf_url":"https://arxiv.org/pdf/2310.08367v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.04099v2","updated":"2023-10-12T14:18:52Z","published":"2023-10-06T09:01:15Z","title":"ClusVPR: Efficient Visual Place Recognition with Clustering-based\n Weighted Transformer","summary":" Visual place recognition (VPR) is a highly challenging task that has a wide\nrange of applications, including robot navigation and self-driving vehicles.\nVPR is particularly difficult due to the presence of duplicate regions and the\nlack of attention to small objects in complex scenes, resulting in recognition\ndeviations. In this paper, we present ClusVPR, a novel approach that tackles\nthe specific issues of redundant information in duplicate regions and\nrepresentations of small objects. Different from existing methods that rely on\nConvolutional Neural Networks (CNNs) for feature map generation, ClusVPR\nintroduces a unique paradigm called Clustering-based Weighted Transformer\nNetwork (CWTNet). CWTNet leverages the power of clustering-based weighted\nfeature maps and integrates global dependencies to effectively address visual\ndeviations encountered in large-scale VPR problems. We also introduce the\noptimized-VLAD (OptLAD) layer that significantly reduces the number of\nparameters and enhances model efficiency. This layer is specifically designed\nto aggregate the information obtained from scale-wise image patches.\nAdditionally, our pyramid self-supervised strategy focuses on extracting\nrepresentative and diverse information from scale-wise image patches instead of\nentire images, which is crucial for capturing representative and diverse\ninformation in VPR. Extensive experiments on four VPR datasets show our model's\nsuperior performance compared to existing models while being less complex.\n","authors":["Yifan Xu","Pourya Shamsolmoali","Jie Yang"],"pdf_url":"https://arxiv.org/pdf/2310.04099v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08339v1","updated":"2023-10-12T13:57:32Z","published":"2023-10-12T13:57:32Z","title":"A Generic Software Framework for Distributed Topological Analysis\n Pipelines","summary":" This system paper presents a software framework for the support of\ntopological analysis pipelines in a distributed-memory model. While several\nrecent papers introduced topology-based approaches for distributed-memory\nenvironments, these were reporting experiments obtained with tailored,\nmono-algorithm implementations. In contrast, we describe in this paper a\ngeneral-purpose, generic framework for topological analysis pipelines, i.e. a\nsequence of topological algorithms interacting together, possibly on distinct\nnumbers of processes. Specifically, we instantiated our framework with the MPI\nmodel, within the Topology ToolKit (TTK). While developing this framework, we\nfaced several algorithmic and software engineering challenges, which we\ndocument in this paper. We provide a taxonomy for the distributed-memory\ntopological algorithms supported by TTK, depending on their communication needs\nand provide examples of hybrid MPI+thread parallelizations. 
Detailed\nperformance analyses show that parallel efficiencies range from $20\\%$ to\n$80\\%$ (depending on the algorithms), and that the MPI-specific preconditioning\nintroduced by our framework induces a negligible computation time overhead. We\nillustrate the new distributed-memory capabilities of TTK with an example of\nadvanced analysis pipeline, combining multiple algorithms, run on the largest\npublicly available dataset we have found (120 billion vertices) on a standard\ncluster with 64 nodes (for a total of 1,536 cores). Finally, we provide a\nroadmap for the completion of TTK's MPI extension, along with generic\nrecommendations for each algorithm communication category.\n","authors":["Eve Le Guillou","Michael Will","Pierre Guillou","Jonas Lukasczyk","Pierre Fortin","Christoph Garth","Julien Tierny"],"pdf_url":"https://arxiv.org/pdf/2310.08339v1.pdf","comment":"18 pages, 12 figures"},{"id":"http://arxiv.org/abs/2206.02136v3","updated":"2023-10-12T13:55:06Z","published":"2022-06-05T09:39:12Z","title":"LDRNet: Enabling Real-time Document Localization on Mobile Devices","summary":" While Identity Document Verification (IDV) technology on mobile devices\nbecomes ubiquitous in modern business operations, the risk of identity theft\nand fraud is increasing. The identity document holder is normally required to\nparticipate in an online video interview to circumvent impostors. However, the\ncurrent IDV process depends on an additional human workforce to support online\nstep-by-step guidance which is inefficient and expensive. The performance of\nexisting AI-based approaches cannot meet the real-time and lightweight demands\nof mobile devices. In this paper, we address those challenges by designing an\nedge intelligence-assisted approach for real-time IDV. Aiming at improving the\nresponsiveness of the IDV process, we propose a new document localization model\nfor mobile devices, LDRNet, to Localize the identity Document in Real-time. On\nthe basis of a lightweight backbone network, we build three prediction branches\nfor LDRNet, the corner points prediction, the line borders prediction and the\ndocument classification. We design novel supplementary targets, the\nequal-division points, and use a new loss function named Line Loss, to improve\nthe speed and accuracy of our approach. In addition to the IDV process, LDRNet\nis an efficient and reliable document localization alternative for all kinds of\nmobile applications. As a matter of proof, we compare the performance of LDRNet\nwith other popular approaches on localizing general document datasets. The\nexperimental results show that LDRNet runs at a speed up to 790 FPS which is\n47x faster, while still achieving comparable Jaccard Index(JI) in single-model\nand single-scale tests.\n","authors":["Han Wu","Holland Qian","Huaming Wu","Aad van Moorsel"],"pdf_url":"https://arxiv.org/pdf/2206.02136v3.pdf","comment":"ECML-PKDD 2022 https://doi.org/10.1007/978-3-031-23618-1_42"},{"id":"http://arxiv.org/abs/2310.08332v1","updated":"2023-10-12T13:46:36Z","published":"2023-10-12T13:46:36Z","title":"Real-Time Neural BRDF with Spherically Distributed Primitives","summary":" We propose a novel compact and efficient neural BRDF offering highly\nversatile material representation, yet with very-light memory and neural\ncomputation consumption towards achieving real-time rendering. 
The results in\nFigure 1, rendered at full HD resolution on a current desktop machine, show\nthat our system achieves real-time rendering with a wide variety of\nappearances, which is approached by the following two designs. On the one hand,\nnoting that bidirectional reflectance is distributed in a very sparse\nhigh-dimensional subspace, we propose to project the BRDF into two\nlow-dimensional components, i.e., two hemisphere feature-grids for incoming and\noutgoing directions, respectively. On the other hand, learnable neural\nreflectance primitives are distributed on our highly-tailored spherical surface\ngrid, which offer informative features for each component and alleviate the\nconventional heavy feature learning network to a much smaller one, leading to\nvery fast evaluation. These primitives are centrally stored in a codebook and\ncan be shared across multiple grids and even across materials, based on the\nlow-cost indices stored in material-specific spherical surface grids. Our\nneural BRDF, which is agnostic to the material, provides a unified framework\nthat can represent a variety of materials in consistent manner. Comprehensive\nexperimental results on measured BRDF compression, Monte Carlo simulated BRDF\nacceleration, and extension to spatially varying effect demonstrate the\nsuperior quality and generalizability achieved by the proposed scheme.\n","authors":["Yishun Dou","Zhong Zheng","Qiaoqiao Jin","Bingbing Ni","Yugang Chen","Junxiang Ke"],"pdf_url":"https://arxiv.org/pdf/2310.08332v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08326v1","updated":"2023-10-12T13:42:49Z","published":"2023-10-12T13:42:49Z","title":"NSM4D: Neural Scene Model Based Online 4D Point Cloud Sequence\n Understanding","summary":" Understanding 4D point cloud sequences online is of significant practical\nvalue in various scenarios such as VR/AR, robotics, and autonomous driving. The\nkey goal is to continuously analyze the geometry and dynamics of a 3D scene as\nunstructured and redundant point cloud sequences arrive. And the main challenge\nis to effectively model the long-term history while keeping computational costs\nmanageable. To tackle these challenges, we introduce a generic online 4D\nperception paradigm called NSM4D. NSM4D serves as a plug-and-play strategy that\ncan be adapted to existing 4D backbones, significantly enhancing their online\nperception capabilities for both indoor and outdoor scenarios. To efficiently\ncapture the redundant 4D history, we propose a neural scene model that\nfactorizes geometry and motion information by constructing geometry tokens\nseparately storing geometry and motion features. Exploiting the history becomes\nas straightforward as querying the neural scene model. As the sequence\nprogresses, the neural scene model dynamically deforms to align with new\nobservations, effectively providing the historical context and updating itself\nwith the new observations. By employing token representation, NSM4D also\nexhibits robustness to low-level sensor noise and maintains a compact size\nthrough a geometric sampling scheme. We integrate NSM4D with state-of-the-art\n4D perception backbones, demonstrating significant improvements on various\nonline perception benchmarks in indoor and outdoor settings. 
Notably, we\nachieve a 9.6% accuracy improvement for HOI4D online action segmentation and a\n3.4% mIoU improvement for SemanticKITTI online semantic segmentation.\nFurthermore, we show that NSM4D inherently offers excellent scalability to\nlonger sequences beyond the training set, which is crucial for real-world\napplications.\n","authors":["Yuhao Dong","Zhuoyang Zhang","Yunze Liu","Li Yi"],"pdf_url":"https://arxiv.org/pdf/2310.08326v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08320v1","updated":"2023-10-12T13:33:04Z","published":"2023-10-12T13:33:04Z","title":"Defending Our Privacy With Backdoors","summary":" The proliferation of large AI models trained on uncurated, often sensitive\nweb-scraped data has raised significant privacy concerns. One of the concerns\nis that adversaries can extract information about the training data using\nprivacy attacks. Unfortunately, the task of removing specific information from\nthe models without sacrificing performance is not straightforward and has\nproven to be challenging. We propose a rather easy yet effective defense based\non backdoor attacks to remove private information such as names of individuals\nfrom models, and focus in this work on text encoders. Specifically, through\nstrategic insertion of backdoors, we align the embeddings of sensitive phrases\nwith those of neutral terms-\"a person\" instead of the person's name. Our\nempirical results demonstrate the effectiveness of our backdoor-based defense\non CLIP by assessing its performance using a specialized privacy attack for\nzero-shot classifiers. Our approach provides not only a new \"dual-use\"\nperspective on backdoor attacks, but also presents a promising avenue to\nenhance the privacy of individuals within models trained on uncurated\nweb-scraped data.\n","authors":["Dominik Hintersdorf","Lukas Struppek","Daniel Neider","Kristian Kersting"],"pdf_url":"https://arxiv.org/pdf/2310.08320v1.pdf","comment":"14 pages, 4 figures"},{"id":"http://arxiv.org/abs/2310.08316v1","updated":"2023-10-12T13:27:21Z","published":"2023-10-12T13:27:21Z","title":"Extended target tracking utilizing machine-learning software -- with\n applications to animal classification","summary":" This paper considers the problem of detecting and tracking objects in a\nsequence of images. The problem is formulated in a filtering framework, using\nthe output of object-detection algorithms as measurements. An extension to the\nfiltering formulation is proposed that incorporates class information from the\nprevious frame to robustify the classification, even if the object-detection\nalgorithm outputs an incorrect prediction. Further, the properties of the\nobject-detection algorithm are exploited to quantify the uncertainty of the\nbounding box detection in each frame. The complete filtering method is\nevaluated on camera trap images of the four large Swedish carnivores, bear,\nlynx, wolf, and wolverine. The experiments show that the class tracking\nformulation leads to a more robust classification.\n","authors":["Magnus Malmström","Anton Kullberg","Isaac Skog","Daniel Axehill","Fredrik Gustafsson"],"pdf_url":"https://arxiv.org/pdf/2310.08316v1.pdf","comment":"5 pages, 3 figures"},{"id":"http://arxiv.org/abs/2310.08312v1","updated":"2023-10-12T13:20:17Z","published":"2023-10-12T13:20:17Z","title":"GePSAn: Generative Procedure Step Anticipation in Cooking Videos","summary":" We study the problem of future step anticipation in procedural videos. 
Given\na video of an ongoing procedural activity, we predict a plausible next\nprocedure step described in rich natural language. While most previous work\nfocus on the problem of data scarcity in procedural video datasets, another\ncore challenge of future anticipation is how to account for multiple plausible\nfuture realizations in natural settings. This problem has been largely\noverlooked in previous work. To address this challenge, we frame future step\nprediction as modelling the distribution of all possible candidates for the\nnext step. Specifically, we design a generative model that takes a series of\nvideo clips as input, and generates multiple plausible and diverse candidates\n(in natural language) for the next step. Following previous work, we side-step\nthe video annotation scarcity by pretraining our model on a large text-based\ncorpus of procedural activities, and then transfer the model to the video\ndomain. Our experiments, both in textual and video domains, show that our model\ncaptures diversity in the next step prediction and generates multiple plausible\nfuture predictions. Moreover, our model establishes new state-of-the-art\nresults on YouCookII, where it outperforms existing baselines on the next step\nanticipation. Finally, we also show that our model can successfully transfer\nfrom text to the video domain zero-shot, ie, without fine-tuning or adaptation,\nand produces good-quality future step predictions from video.\n","authors":["Mohamed Ashraf Abdelsalam","Samrudhdhi B. Rangrej","Isma Hadji","Nikita Dvornik","Konstantinos G. Derpanis","Afsaneh Fazly"],"pdf_url":"https://arxiv.org/pdf/2310.08312v1.pdf","comment":"published at ICCV 2023"},{"id":"http://arxiv.org/abs/2310.08304v1","updated":"2023-10-12T13:11:38Z","published":"2023-10-12T13:11:38Z","title":"CHIP: Contrastive Hierarchical Image Pretraining","summary":" Few-shot object classification is the task of classifying objects in an image\nwith limited number of examples as supervision. We propose a one-shot/few-shot\nclassification model that can classify an object of any unseen class into a\nrelatively general category in an hierarchically based classification. Our\nmodel uses a three-level hierarchical contrastive loss based ResNet152\nclassifier for classifying an object based on its features extracted from Image\nembedding, not used during the training phase. For our experimentation, we have\nused a subset of the ImageNet (ILSVRC-12) dataset that contains only the animal\nclasses for training our model and created our own dataset of unseen classes\nfor evaluating our trained model. Our model provides satisfactory results in\nclassifying the unknown objects into a generic category which has been later\ndiscussed in greater detail.\n","authors":["Arpit Mittal","Harshil Jhaveri","Swapnil Mallick","Abhishek Ajmera"],"pdf_url":"https://arxiv.org/pdf/2310.08304v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08303v1","updated":"2023-10-12T13:09:40Z","published":"2023-10-12T13:09:40Z","title":"Multimodal Variational Auto-encoder based Audio-Visual Segmentation","summary":" We propose an Explicit Conditional Multimodal Variational Auto-Encoder\n(ECMVAE) for audio-visual segmentation (AVS), aiming to segment sound sources\nin the video sequence. Existing AVS methods focus on implicit feature fusion\nstrategies, where models are trained to fit the discrete samples in the\ndataset. With a limited and less diverse dataset, the resulting performance is\nusually unsatisfactory. 
In contrast, we address this problem from an effective\nrepresentation learning perspective, aiming to model the contribution of each\nmodality explicitly. Specifically, we find that audio contains critical\ncategory information of the sound producers, and visual data provides candidate\nsound producer(s). Their shared information corresponds to the target sound\nproducer(s) shown in the visual data. In this case, cross-modal shared\nrepresentation learning is especially important for AVS. To achieve this, our\nECMVAE factorizes the representations of each modality with a modality-shared\nrepresentation and a modality-specific representation. An orthogonality\nconstraint is applied between the shared and specific representations to\nmaintain the exclusive attribute of the factorized latent code. Further, a\nmutual information maximization regularizer is introduced to achieve extensive\nexploration of each modality. Quantitative and qualitative evaluations on the\nAVSBench demonstrate the effectiveness of our approach, leading to a new\nstate-of-the-art for AVS, with a 3.84 mIOU performance leap on the challenging\nMS3 subset for multiple sound source segmentation.\n","authors":["Yuxin Mao","Jing Zhang","Mochu Xiang","Yiran Zhong","Yuchao Dai"],"pdf_url":"https://arxiv.org/pdf/2310.08303v1.pdf","comment":"Accepted by ICCV2023,Project\n page(https://npucvr.github.io/MMVAE-AVS),Code(https://github.com/OpenNLPLab/MMVAE-AVS)"},{"id":"http://arxiv.org/abs/2308.12435v2","updated":"2023-10-12T12:57:55Z","published":"2023-08-23T21:36:35Z","title":"Characterising representation dynamics in recurrent neural networks for\n object recognition","summary":" Recurrent neural networks (RNNs) have yielded promising results for both\nrecognizing objects in challenging conditions and modeling aspects of primate\nvision. However, the representational dynamics of recurrent computations remain\npoorly understood, especially in large-scale visual models. Here, we studied\nsuch dynamics in RNNs trained for object classification on MiniEcoset, a novel\nsubset of ecoset. We report two main insights. First, upon inference,\nrepresentations continued to evolve after correct classification, suggesting a\nlack of the notion of being ``done with classification''. Second, focusing on\n``readout zones'' as a way to characterize the activation trajectories, we\nobserve that misclassified representations exhibit activation patterns with\nlower L2 norm, and are positioned more peripherally in the readout zones. Such\narrangements help the misclassified representations move into the correct zones\nas time progresses. Our findings generalize to networks with lateral and\ntop-down connections, and include both additive and multiplicative interactions\nwith the bottom-up sweep. The results therefore contribute to a general\nunderstanding of RNN dynamics in naturalistic tasks. We hope that the analysis\nframework will aid future investigations of other types of RNNs, including\nunderstanding of representational dynamics in primate vision.\n","authors":["Sushrut Thorat","Adrien Doerig","Tim C. 
Kietzmann"],"pdf_url":"https://arxiv.org/pdf/2308.12435v2.pdf","comment":"8 pages, 7 figures; revision of our Conference on Cognitive\n Computational Neuroscience (CCN) 2023 paper"},{"id":"http://arxiv.org/abs/2202.09348v2","updated":"2023-10-12T12:56:28Z","published":"2022-02-18T18:36:01Z","title":"A Machine Learning Paradigm for Studying Pictorial Realism: Are\n Constable's Clouds More Real than His Contemporaries?","summary":" The British landscape painter John Constable is considered foundational for\nthe Realist movement in 19th-century European painting. Constable's painted\nskies, in particular, were seen as remarkably accurate by his contemporaries,\nan impression shared by many viewers today. Yet, assessing the accuracy of\nrealist paintings like Constable's is subjective or intuitive, even for\nprofessional art historians, making it difficult to say with certainty what set\nConstable's skies apart from those of his contemporaries. Our goal is to\ncontribute to a more objective understanding of Constable's realism. We propose\na new machine-learning-based paradigm for studying pictorial realism in an\nexplainable way. Our framework assesses realism by measuring the similarity\nbetween clouds painted by artists noted for their skies, like Constable, and\nphotographs of clouds. The experimental results of cloud classification show\nthat Constable approximates more consistently than his contemporaries the\nformal features of actual clouds in his paintings. The study, as a novel\ninterdisciplinary approach that combines computer vision and machine learning,\nmeteorology, and art history, is a springboard for broader and deeper analyses\nof pictorial realism.\n","authors":["Zhuomin Zhang","Elizabeth C. Mansfield","Jia Li","John Russell","George S. Young","Catherine Adams","James Z. Wang"],"pdf_url":"https://arxiv.org/pdf/2202.09348v2.pdf","comment":"Supplementary materials are available from the authors or\n http://wang.ist.psu.edu"},{"id":"http://arxiv.org/abs/2303.07189v3","updated":"2023-10-12T12:47:22Z","published":"2023-03-13T15:30:28Z","title":"Optimizing Convolutional Neural Networks for Chronic Obstructive\n Pulmonary Disease Detection in Clinical Computed Tomography Imaging","summary":" We aim to optimize the binary detection of Chronic Obstructive Pulmonary\nDisease (COPD) based on emphysema presence in the lung with convolutional\nneural networks (CNN) by exploring manually adjusted versus automated\nwindow-setting optimization (WSO) on computed tomography (CT) images. 7,194 CT\nimages (3,597 with COPD; 3,597 healthy controls) from 78 subjects (43 with\nCOPD; 35 healthy controls) were selected retrospectively (10.2018-12.2019) and\npreprocessed. For each image, intensity values were manually clipped to the\nemphysema window setting and a baseline 'full-range' window setting.\nClass-balanced train, validation, and test sets contained 3,392, 1,114, and\n2,688 images. The network backbone was optimized by comparing various CNN\narchitectures. Furthermore, automated WSO was implemented by adding a\ncustomized layer to the model. The image-level area under the Receiver\nOperating Characteristics curve (AUC) [lower, upper limit 95% confidence] was\nutilized to compare model variations. Repeated inference (n=7) on the test set\nshowed that the DenseNet was the most efficient backbone and achieved a mean\nAUC of 0.80 [0.76, 0.85] without WSO. 
Comparably, with input images manually\nadjusted to the emphysema window, the DenseNet model predicted COPD with a mean\nAUC of 0.86 [0.82, 0.89]. By adding a customized WSO layer to the DenseNet, an\noptimal window in the proximity of the emphysema window setting was learned\nautomatically, and a mean AUC of 0.82 [0.78, 0.86] was achieved. Detection of\nCOPD with DenseNet models was improved by WSO of CT data to the emphysema\nwindow setting range.\n","authors":["Tina Dorosti","Manuel Schultheiss","Felix Hofmann","Johannes Thalhammer","Luisa Kirchner","Theresa Urban","Franz Pfeiffer","Florian Schaff","Tobias Lasser","Daniela Pfeiffer"],"pdf_url":"https://arxiv.org/pdf/2303.07189v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.01601v3","updated":"2023-10-12T12:43:39Z","published":"2023-04-04T07:43:56Z","title":"Primitive Simultaneous Optimization of Similarity Metrics for Image\n Registration","summary":" Even though simultaneous optimization of similarity metrics is a standard\nprocedure in the field of semantic segmentation, surprisingly, this is much\nless established for image registration. To help closing this gap in the\nliterature, we investigate in a complex multi-modal 3D setting whether\nsimultaneous optimization of registration metrics, here implemented by means of\nprimitive summation, can benefit image registration. We evaluate two\nchallenging datasets containing collections of pre- to post-operative and pre-\nto intra-operative MR images of glioma. Employing the proposed optimization, we\ndemonstrate improved registration accuracy in terms of TRE on expert\nneuroradiologists' landmark annotations.\n","authors":["Diana Waldmannstetter","Benedikt Wiestler","Julian Schwarting","Ivan Ezhov","Marie Metz","Spyridon Bakas","Bhakti Baheti","Satrajit Chakrabarty","Daniel Rueckert","Jan S. Kirschke","Rolf A. Heckemann","Marie Piraud","Bjoern H. Menze","Florian Kofler"],"pdf_url":"https://arxiv.org/pdf/2304.01601v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08276v1","updated":"2023-10-12T12:28:47Z","published":"2023-10-12T12:28:47Z","title":"Direction-Oriented Visual-semantic Embedding Model for Remote Sensing\n Image-text Retrieval","summary":" Image-text retrieval has developed rapidly in recent years. However, it is\nstill a challenge in remote sensing due to visual-semantic imbalance, which\nleads to incorrect matching of non-semantic visual and textual features. To\nsolve this problem, we propose a novel Direction-Oriented Visual-semantic\nEmbedding Model (DOVE) to mine the relationship between vision and language.\nConcretely, a Regional-Oriented Attention Module (ROAM) adaptively adjusts the\ndistance between the final visual and textual embeddings in the latent semantic\nspace, oriented by regional visual features. Meanwhile, a lightweight Digging\nText Genome Assistant (DTGA) is designed to expand the range of tractable\ntextual representation and enhance global word-level semantic connections using\nless attention operations. Ultimately, we exploit a global visual-semantic\nconstraint to reduce single visual dependency and serve as an external\nconstraint for the final visual and textual representations. 
The effectiveness\nand superiority of our method are verified by extensive experiments including\nparameter evaluation, quantitative comparison, ablation studies and visual\nanalysis, on two benchmark datasets, RSICD and RSITMD.\n","authors":["Qing Ma","Jiancheng Pan","Cong Bai"],"pdf_url":"https://arxiv.org/pdf/2310.08276v1.pdf","comment":"13 pages, 11 figures"},{"id":"http://arxiv.org/abs/2310.08261v1","updated":"2023-10-12T12:06:31Z","published":"2023-10-12T12:06:31Z","title":"GraphAlign: Enhancing Accurate Feature Alignment by Graph matching for\n Multi-Modal 3D Object Detection","summary":" LiDAR and cameras are complementary sensors for 3D object detection in\nautonomous driving. However, it is challenging to explore the unnatural\ninteraction between point clouds and images, and the critical factor is how to\nconduct feature alignment of heterogeneous modalities. Currently, many methods\nachieve feature alignment by projection calibration only, without considering\nthe problem of coordinate conversion accuracy errors between sensors, leading\nto sub-optimal performance. In this paper, we present GraphAlign, a more\naccurate feature alignment strategy for 3D object detection by graph matching.\nSpecifically, we fuse image features from a semantic segmentation encoder in\nthe image branch and point cloud features from a 3D Sparse CNN in the LiDAR\nbranch. To save computation, we construct the nearest neighbor relationship by\ncalculating Euclidean distance within the subspaces that are divided into the\npoint cloud features. Through the projection calibration between the image and\npoint cloud, we project the nearest neighbors of point cloud features onto the\nimage features. Then by matching the nearest neighbors with a single point\ncloud to multiple images, we search for a more appropriate feature alignment.\nIn addition, we provide a self-attention module to enhance the weights of\nsignificant relations to fine-tune the feature alignment between heterogeneous\nmodalities. Extensive experiments on nuScenes benchmark demonstrate the\neffectiveness and efficiency of our GraphAlign.\n","authors":["Ziying Song","Haiyue Wei","Lin Bai","Lei Yang","Caiyan Jia"],"pdf_url":"https://arxiv.org/pdf/2310.08261v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08259v1","updated":"2023-10-12T12:05:51Z","published":"2023-10-12T12:05:51Z","title":"Invisible Threats: Backdoor Attack in OCR Systems","summary":" Optical Character Recognition (OCR) is a widely used tool to extract text\nfrom scanned documents. Today, the state-of-the-art is achieved by exploiting\ndeep neural networks. However, the cost of this performance is paid at the\nprice of system vulnerability. For instance, in backdoor attacks, attackers\ncompromise the training phase by inserting a backdoor in the victim's model\nthat will be activated at testing time by specific patterns while leaving the\noverall model performance intact. This work proposes a backdoor attack for OCR\nresulting in the injection of non-readable characters from malicious input\nimages. 
This simple but effective attack exposes the state-of-the-art OCR\nweakness, making the extracted text correct to human eyes but simultaneously\nunusable for the NLP application that uses OCR as a preprocessing step.\nExperimental results show that the attacked models successfully output\nnon-readable characters for around 90% of the poisoned instances without\nharming their performance for the remaining instances.\n","authors":["Mauro Conti","Nicola Farronato","Stefanos Koffas","Luca Pajola","Stjepan Picek"],"pdf_url":"https://arxiv.org/pdf/2310.08259v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.04825v2","updated":"2023-10-12T12:05:15Z","published":"2023-10-07T14:29:57Z","title":"Comparative study of multi-person tracking methods","summary":" This paper presents a study of two tracking algorithms (SORT~\\cite{7533003}\nand Tracktor++~\\cite{2019}) that were ranked first positions on the MOT\nChallenge leaderboard (The MOTChallenge web page: https://motchallenge.net ).\nThe purpose of this study is to discover the techniques used and to provide\nuseful insights about these algorithms in the tracking pipeline that could\nimprove the performance of MOT tracking algorithms. To this end, we adopted the\npopular tracking-by-detection approach. We trained our own Pedestrian Detection\nmodel using the MOT17Det dataset (MOT17Det :\nhttps://motchallenge.net/data/MOT17Det/ ). We also used a re-identification\nmodel trained on MOT17 dataset (MOT17 : https://motchallenge.net/data/MOT17/ )\nfor Tracktor++ to reduce the false re-identification alarms. We then present\nexperimental results which shows that Tracktor++ is a better multi-person\ntracking algorithm than SORT. We also performed ablation studies to discover\nthe contribution of re-identification(RE-ID) network and motion to the results\nof Tracktor++. We finally conclude by providing some recommendations for future\nresearch.\n","authors":["Denis Mbey Akola"],"pdf_url":"https://arxiv.org/pdf/2310.04825v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.04829v2","updated":"2023-10-12T12:04:16Z","published":"2023-10-07T14:38:16Z","title":"How to effectively train an ensemble of Faster R-CNN object detectors to\n quantify uncertainty","summary":" This paper presents a new approach for training two-stage object detection\nensemble models, more specifically, Faster R-CNN models to estimate\nuncertainty. We propose training one Region Proposal\nNetwork(RPN)~\\cite{https://doi.org/10.48550/arxiv.1506.01497} and multiple Fast\nR-CNN prediction heads is all you need to build a robust deep ensemble network\nfor estimating uncertainty in object detection. We present this approach and\nprovide experiments to show that this approach is much faster than the naive\nmethod of fully training all $n$ models in an ensemble. We also estimate the\nuncertainty by measuring this ensemble model's Expected Calibration Error\n(ECE). We then further compare the performance of this model with that of\nGaussian YOLOv3, a variant of YOLOv3 that models uncertainty using predicted\nbounding box coordinates. 
The source code is released at\n\\url{https://github.com/Akola-Mbey-Denis/EfficientEnsemble}\n","authors":["Denis Mbey Akola","Gianni Franchi"],"pdf_url":"https://arxiv.org/pdf/2310.04829v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.19443v2","updated":"2023-10-12T12:02:34Z","published":"2023-05-30T22:34:48Z","title":"OWAdapt: An adaptive loss function for deep learning using OWA operators","summary":" In this paper, we propose a fuzzy adaptive loss function for enhancing deep\nlearning performance in classification tasks. Specifically, we redefine the\ncross-entropy loss to effectively address class-level noise conditions,\nincluding the challenging problem of class imbalance. Our approach introduces\naggregation operators, leveraging the power of fuzzy logic to improve\nclassification accuracy. The rationale behind our proposed method lies in the\niterative up-weighting of class-level components within the loss function,\nfocusing on those with larger errors. To achieve this, we employ the ordered\nweighted average (OWA) operator and combine it with an adaptive scheme for\ngradient-based learning. Through extensive experimentation, our method\noutperforms other commonly used loss functions, such as the standard\ncross-entropy or focal loss, across various binary and multiclass\nclassification tasks. Furthermore, we explore the influence of hyperparameters\nassociated with the OWA operators and present a default configuration that\nperforms well across different experimental settings.\n","authors":["Sebastián Maldonado","Carla Vairetti","Katherine Jara","Miguel Carrasco","Julio López"],"pdf_url":"https://arxiv.org/pdf/2305.19443v2.pdf","comment":"15 pages, 1 figure, published"},{"id":"http://arxiv.org/abs/2310.08255v1","updated":"2023-10-12T11:59:54Z","published":"2023-10-12T11:59:54Z","title":"Distilling from Vision-Language Models for Improved OOD Generalization\n in Vision Tasks","summary":" Vision-Language Models (VLMs) such as CLIP are trained on large amounts of\nimage-text pairs, resulting in remarkable generalization across several data\ndistributions. The prohibitively expensive training and data\ncollection/curation costs of these models make them valuable Intellectual\nProperty (IP) for organizations. This motivates a vendor-client paradigm, where\na vendor trains a large-scale VLM and grants only input-output access to\nclients on a pay-per-query basis in a black-box setting. The client aims to\nminimize inference cost by distilling the VLM to a student model using the\nlimited available task-specific data, and further deploying this student model\nin the downstream application. While naive distillation largely improves the\nIn-Domain (ID) accuracy of the student, it fails to transfer the superior\nout-of-distribution (OOD) generalization of the VLM teacher using the limited\navailable labeled images. To mitigate this, we propose Vision-Language to\nVision-Align, Distill, Predict (VL2V-ADiP), which first aligns the vision and\nlanguage modalities of the teacher model with the vision modality of a\npre-trained student model, and further distills the aligned VLM embeddings to\nthe student. This maximally retains the pre-trained features of the student,\nwhile also incorporating the rich representations of the VLM image encoder and\nthe superior generalization of the text embeddings. 
The proposed approach\nachieves state-of-the-art results on the standard Domain Generalization\nbenchmarks in a black-box teacher setting, and also when weights of the VLM are\naccessible.\n","authors":["Sravanti Addepalli","Ashish Ramayee Asokan","Lakshay Sharma","R. Venkatesh Babu"],"pdf_url":"https://arxiv.org/pdf/2310.08255v1.pdf","comment":"Code is available at https://github.com/val-iisc/VL2V-ADiP.git"},{"id":"http://arxiv.org/abs/2309.06188v2","updated":"2023-10-12T11:51:21Z","published":"2023-09-12T12:54:12Z","title":"Computer Vision Pipeline for Automated Antarctic Krill Analysis","summary":" British Antarctic Survey (BAS) researchers launch annual expeditions to the\nAntarctic in order to estimate Antarctic Krill biomass and assess the change\nfrom previous years. These comparisons provide insight into the effects of the\ncurrent environment on this key component of the marine food chain. In this\nwork we have developed tools for automating the data collection and analysis\nprocess, using web-based image annotation tools and deep learning image\nclassification and regression models. We achieve highly accurate krill instance\nsegmentation results with an average 77.28% AP score, as well as separate\nmaturity stage and length estimation of krill specimens with 62.99% accuracy\nand a 1.98mm length error respectively.\n","authors":["Mazvydas Gudelis","Michal Mackiewicz","Julie Bremner","Sophie Fielding"],"pdf_url":"https://arxiv.org/pdf/2309.06188v2.pdf","comment":"Accepted to MVEO @ BMVC 2023"},{"id":"http://arxiv.org/abs/2308.03998v4","updated":"2023-10-12T11:49:34Z","published":"2023-08-08T02:28:48Z","title":"Real-time Strawberry Detection Based on Improved YOLOv5s Architecture\n for Robotic Harvesting in open-field environment","summary":" This study proposed a YOLOv5-based custom object detection model to detect\nstrawberries in an outdoor environment. The original architecture of the\nYOLOv5s was modified by replacing the C3 module with the C2f module in the\nbackbone network, which provided a better feature gradient flow. Secondly, the\nSpatial Pyramid Pooling Fast in the final layer of the backbone network of\nYOLOv5s was combined with Cross Stage Partial Net to improve the generalization\nability over the strawberry dataset in this study. The proposed architecture\nwas named YOLOv5s-Straw. The RGB images dataset of the strawberry canopy with\nthree maturity classes (immature, nearly mature, and mature) was collected in\nopen-field environment and augmented through a series of operations including\nbrightness reduction, brightness increase, and noise adding. To verify the\nsuperiority of the proposed method for strawberry detection in open-field\nenvironment, four competitive detection models (YOLOv3-tiny, YOLOv5s,\nYOLOv5s-C2f, and YOLOv8s) were trained, and tested under the same computational\nenvironment and compared with YOLOv5s-Straw. The results showed that the\nhighest mean average precision of 80.3% was achieved using the proposed\narchitecture whereas the same was achieved with YOLOv3-tiny, YOLOv5s,\nYOLOv5s-C2f, and YOLOv8s were 73.4%, 77.8%, 79.8%, 79.3%, respectively.\nSpecifically, the average precision of YOLOv5s-Straw was 82.1% in the immature\nclass, 73.5% in the nearly mature class, and 86.6% in the mature class, which\nwere 2.3% and 3.7%, respectively, higher than that of the latest YOLOv8s. 
The\nmodel included 8.6*10^6 network parameters with an inference speed of 18ms per\nimage while the inference speed of YOLOv8s had a slower inference speed of\n21.0ms and heavy parameters of 11.1*10^6, which indicates that the proposed\nmodel is fast enough for real time strawberry detection and localization for\nthe robotic picking.\n","authors":["Zixuan He","Salik Ram Khanal","Xin Zhang","Manoj Karkee","Qin Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.03998v4.pdf","comment":"20 pages; 15 figures"},{"id":"http://arxiv.org/abs/2310.08230v1","updated":"2023-10-12T11:23:07Z","published":"2023-10-12T11:23:07Z","title":"Fast Discrete Optimisation for Geometrically Consistent 3D Shape\n Matching","summary":" In this work we propose to combine the advantages of learning-based and\ncombinatorial formalisms for 3D shape matching. While learning-based shape\nmatching solutions lead to state-of-the-art matching performance, they do not\nensure geometric consistency, so that obtained matchings are locally unsmooth.\nOn the contrary, axiomatic methods allow to take geometric consistency into\naccount by explicitly constraining the space of valid matchings. However,\nexisting axiomatic formalisms are impractical since they do not scale to\npractically relevant problem sizes, or they require user input for the\ninitialisation of non-convex optimisation problems. In this work we aim to\nclose this gap by proposing a novel combinatorial solver that combines a unique\nset of favourable properties: our approach is (i) initialisation free, (ii)\nmassively parallelisable powered by a quasi-Newton method, (iii) provides\noptimality gaps, and (iv) delivers decreased runtime and globally optimal\nresults for many instances.\n","authors":["Paul Roetzer","Ahmed Abbas","Dongliang Cao","Florian Bernard","Paul Swoboda"],"pdf_url":"https://arxiv.org/pdf/2310.08230v1.pdf","comment":"Paul Roetzer and Ahmed Abbas contributed equally"},{"id":"http://arxiv.org/abs/2310.08222v1","updated":"2023-10-12T11:14:27Z","published":"2023-10-12T11:14:27Z","title":"Structural analysis of Hindi online handwritten characters for character\n recognition","summary":" Direction properties of online strokes are used to analyze them in terms of\nhomogeneous regions or sub-strokes with points satisfying common geometric\nproperties. Such sub-strokes are called sub-units. These properties are used to\nextract sub-units from Hindi ideal online characters. These properties along\nwith some heuristics are used to extract sub-units from Hindi online\nhandwritten characters.\\\\ A method is developed to extract point stroke,\nclockwise curve stroke, counter-clockwise curve stroke and loop stroke segments\nas sub-units from Hindi online handwritten characters. These extracted\nsub-units are close in structure to the sub-units of the corresponding Hindi\nonline ideal characters.\\\\ Importance of local representation of online\nhandwritten characters in terms of sub-units is assessed by training a\nclassifier with sub-unit level local and character level global features\nextracted from characters for character recognition. The classifier has the\nrecognition accuracy of 93.5\\% on the testing set. This accuracy is the highest\nwhen compared with that of the classifiers trained only with global features\nextracted from characters in the same training set and evaluated on the same\ntesting set.\\\\ Sub-unit extraction algorithm and the sub-unit based character\nclassifier are tested on Hindi online handwritten character dataset. 
This\ndataset consists of samples from 96 different characters. There are 12832 and\n2821 samples in the training and testing sets, respectively.\n","authors":["Anand Sharma","A. G. Ramakrishnan"],"pdf_url":"https://arxiv.org/pdf/2310.08222v1.pdf","comment":"34 pages, 36 jpg figures"},{"id":"http://arxiv.org/abs/2309.00848v2","updated":"2023-10-12T11:11:23Z","published":"2023-09-02T07:17:43Z","title":"Bengali Document Layout Analysis -- A YOLOV8 Based Ensembling Approach","summary":" This paper focuses on enhancing Bengali Document Layout Analysis (DLA) using\nthe YOLOv8 model and innovative post-processing techniques. We tackle\nchallenges unique to the complex Bengali script by employing data augmentation\nfor model robustness. After meticulous validation set evaluation, we fine-tune\nour approach on the complete dataset, leading to a two-stage prediction\nstrategy for accurate element segmentation. Our ensemble model, combined with\npost-processing, outperforms individual base architectures, addressing issues\nidentified in the BaDLAD dataset. By leveraging this approach, we aim to\nadvance Bengali document analysis, contributing to improved OCR and document\ncomprehension and BaDLAD serves as a foundational resource for this endeavor,\naiding future research in the field. Furthermore, our experiments provided key\ninsights to incorporate new strategies into the established solution.\n","authors":["Nazmus Sakib Ahmed","Saad Sakib Noor","Ashraful Islam Shanto Sikder","Abhijit Paul"],"pdf_url":"https://arxiv.org/pdf/2309.00848v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08217v1","updated":"2023-10-12T11:05:34Z","published":"2023-10-12T11:05:34Z","title":"TriRE: A Multi-Mechanism Learning Paradigm for Continual Knowledge\n Retention and Promotion","summary":" Continual learning (CL) has remained a persistent challenge for deep neural\nnetworks due to catastrophic forgetting (CF) of previously learned tasks.\nSeveral techniques such as weight regularization, experience rehearsal, and\nparameter isolation have been proposed to alleviate CF. Despite their relative\nsuccess, these research directions have predominantly remained orthogonal and\nsuffer from several shortcomings, while missing out on the advantages of\ncompeting strategies. On the contrary, the brain continually learns,\naccommodates, and transfers knowledge across tasks by simultaneously leveraging\nseveral neurophysiological processes, including neurogenesis, active\nforgetting, neuromodulation, metaplasticity, experience rehearsal, and\ncontext-dependent gating, rarely resulting in CF. Inspired by how the brain\nexploits multiple mechanisms concurrently, we propose TriRE, a novel CL\nparadigm that encompasses retaining the most prominent neurons for each task,\nrevising and solidifying the extracted knowledge of current and past tasks, and\nactively promoting less active neurons for subsequent tasks through rewinding\nand relearning. 
Across CL settings, TriRE significantly reduces task\ninterference and surpasses different CL approaches considered in isolation.\n","authors":["Preetha Vijayan","Prashant Bhat","Elahe Arani","Bahram Zonooz"],"pdf_url":"https://arxiv.org/pdf/2310.08217v1.pdf","comment":"Accepted at 37th Conference on Neural Information Processing Systems\n (NeurIPS 2023)"},{"id":"http://arxiv.org/abs/2310.08206v1","updated":"2023-10-12T10:51:23Z","published":"2023-10-12T10:51:23Z","title":"Long-Tailed Classification Based on Coarse-Grained Leading Forest and\n Multi-Center Loss","summary":" Long-tailed(LT) classification is an unavoidable and challenging problem in\nthe real world. Most of the existing long-tailed classification methods focus\nonly on solving the inter-class imbalance in which there are more samples in\nthe head class than in the tail class, while ignoring the intra-lass imbalance\nin which the number of samples of the head attribute within the same class is\nmuch larger than the number of samples of the tail attribute. The deviation in\nthe model is caused by both of these factors, and due to the fact that\nattributes are implicit in most datasets and the combination of attributes is\nvery complex, the intra-class imbalance is more difficult to handle. For this\npurpose, we proposed a long-tailed classification framework, known as\n\\textbf{\\textsc{Cognisance}}, which is founded on Coarse-Grained Leading Forest\n(CLF) and Multi-Center Loss (MCL), aiming to build a multi-granularity joint\nsolution model by means of invariant feature learning. In this method, we\ndesigned an unsupervised learning method, i.e., CLF, to better characterize the\ndistribution of attributes within a class. Depending on the distribution of\nattributes, we can flexibly construct sampling strategies suitable for\ndifferent environments. In addition, we introduce a new metric learning loss\n(MCL), which aims to gradually eliminate confusing attributes during the\nfeature learning process. More importantly, this approach does not depend on a\nspecific model structure and can be integrated with existing LT methods as an\nindependent component. We have conducted extensive experiments and our approach\nhas state-of-the-art performance in both existing benchmarks ImageNet-GLT and\nMSCOCO-GLT, and can improve the performance of existing LT methods. Our codes\nare available on GitHub: \\url{https://github.com/jinyery/cognisance}\n","authors":["Jinye Yang","Ji Xu"],"pdf_url":"https://arxiv.org/pdf/2310.08206v1.pdf","comment":"This is another research work to apply leading tree structure along\n with deep learning architecture"},{"id":"http://arxiv.org/abs/2310.08204v1","updated":"2023-10-12T10:50:21Z","published":"2023-10-12T10:50:21Z","title":"Lifelong Audio-video Masked Autoencoder with Forget-robust Localized\n Alignments","summary":" We present a lifelong audio-video masked autoencoder that continually learns\nthe multimodal representations from a video stream containing audio-video\npairs, while its distribution continually shifts over time. Specifically, we\npropose two novel ideas to tackle the problem: (1) Localized Alignment: We\nintroduce a small trainable multimodal encoder that predicts the audio and\nvideo tokens that are well-aligned with each other. This allows the model to\nlearn only the highly correlated audiovisual patches with accurate multimodal\nrelationships. 
(2) Forget-robust multimodal patch selection: We compare the\nrelative importance of each audio-video patch between the current and past data\npair to mitigate unintended drift of the previously learned audio-video\nrepresentations. Our proposed method, FLAVA (Forget-robust Localized\nAudio-Video Alignment), therefore, captures the complex relationships between\nthe audio and video modalities during training on a sequence of pre-training\ntasks while alleviating the forgetting of learned audiovisual correlations. Our\nexperiments validate that FLAVA outperforms the state-of-the-art continual\nlearning methods on several benchmark datasets under continual audio-video\nrepresentation learning scenarios.\n","authors":["Jaewoo Lee","Jaehong Yoon","Wonjae Kim","Yunji Kim","Sung Ju Hwang"],"pdf_url":"https://arxiv.org/pdf/2310.08204v1.pdf","comment":"Preprint, project page: https://g-jwlee.github.io/FLAVA/"},{"id":"http://arxiv.org/abs/2310.05969v2","updated":"2023-10-12T10:26:17Z","published":"2023-09-28T07:57:03Z","title":"Automated Chest X-Ray Report Generator Using Multi-Model Deep Learning\n Approach","summary":" Reading and interpreting chest X-ray images is one of the most radiologist's\nroutines. However, it still can be challenging, even for the most experienced\nones. Therefore, we proposed a multi-model deep learning-based automated chest\nX-ray report generator system designed to assist radiologists in their work.\nThe basic idea of the proposed system is by utilizing multi\nbinary-classification models for detecting multi abnormalities, with each model\nresponsible for detecting one abnormality, in a single image. In this study, we\nlimited the radiology abnormalities detection to only cardiomegaly, lung\neffusion, and consolidation. The system generates a radiology report by\nperforming the following three steps: image pre-processing, utilizing deep\nlearning models to detect abnormalities, and producing a report. The aim of the\nimage pre-processing step is to standardize the input by scaling it to 128x128\npixels and slicing it into three segments, which covers the upper, lower, and\nmiddle parts of the lung. After pre-processing, each corresponding model\nclassifies the image, resulting in a 0 (zero) for no abnormality detected and a\n1 (one) for the presence of an abnormality. The prediction outputs of each\nmodel are then concatenated to form a 'result code'. The 'result code' is used\nto construct a report by selecting the appropriate pre-determined sentence for\neach detected abnormality in the report generation step. The proposed system is\nexpected to reduce the workload of radiologists and increase the accuracy of\nchest X-ray diagnosis.\n","authors":["Arief Purnama Muharram","Hollyana Puteri Haryono","Abassi Haji Juma","Ira Puspasari","Nugraha Priya Utama"],"pdf_url":"https://arxiv.org/pdf/2310.05969v2.pdf","comment":"Presented in the 2023 IEEE International Conference on Data and\n Software Engineering (ICoDSE 2023)"},{"id":"http://arxiv.org/abs/2309.13336v2","updated":"2023-10-12T10:24:42Z","published":"2023-09-23T10:58:08Z","title":"FedDrive v2: an Analysis of the Impact of Label Skewness in Federated\n Semantic Segmentation for Autonomous Driving","summary":" We propose FedDrive v2, an extension of the Federated Learning benchmark for\nSemantic Segmentation in Autonomous Driving. While the first version aims at\nstudying the effect of domain shift of the visual features across clients, in\nthis work, we focus on the distribution skewness of the labels. 
We propose six\nnew federated scenarios to investigate how label skewness affects the\nperformance of segmentation models and compare it with the effect of domain\nshift. Finally, we study the impact of using the domain information during\ntesting. Official website: https://feddrive.github.io\n","authors":["Eros Fanì","Marco Ciccone","Barbara Caputo"],"pdf_url":"https://arxiv.org/pdf/2309.13336v2.pdf","comment":"5th Italian Conference on Robotics and Intelligent Machines (I-RIM)\n 2023"},{"id":"http://arxiv.org/abs/2310.08182v1","updated":"2023-10-12T10:17:40Z","published":"2023-10-12T10:17:40Z","title":"XIMAGENET-12: An Explainable AI Benchmark Dataset for Model Robustness\n Evaluation","summary":" The lack of standardized robustness metrics and the widespread reliance on\nnumerous unrelated benchmark datasets for testing have created a gap between\nacademically validated robust models and their often problematic practical\nadoption. To address this, we introduce XIMAGENET-12, an explainable benchmark\ndataset with over 200K images and 15,600 manual semantic annotations. Covering\n12 categories from ImageNet to represent objects commonly encountered in\npractical life and simulating six diverse scenarios, including overexposure,\nblurring, color changing, etc., we further propose a novel robustness criterion\nthat extends beyond model generation ability assessment. This benchmark\ndataset, along with related code, is available at\nhttps://sites.google.com/view/ximagenet-12/home. Researchers and practitioners\ncan leverage this resource to evaluate the robustness of their visual models\nunder challenging conditions and ultimately benefit from the demands of\npractical computer vision systems.\n","authors":["Qiang Li","Dan Zhang","Shengzhao Lei","Xun Zhao","Shuyan Li","Porawit Kamnoedboon","WeiWei Li"],"pdf_url":"https://arxiv.org/pdf/2310.08182v1.pdf","comment":"UnderSubmission"},{"id":"http://arxiv.org/abs/2310.07449v2","updated":"2023-10-12T10:14:39Z","published":"2023-10-11T12:51:16Z","title":"PoRF: Pose Residual Field for Accurate Neural Surface Reconstruction","summary":" Neural surface reconstruction is sensitive to the camera pose noise, even if\nstate-of-the-art pose estimators like COLMAP or ARKit are used. More\nimportantly, existing Pose-NeRF joint optimisation methods have struggled to\nimprove pose accuracy in challenging real-world scenarios. To overcome the\nchallenges, we introduce the pose residual field (\\textbf{PoRF}), a novel\nimplicit representation that uses an MLP for regressing pose updates. This is\nmore robust than the conventional pose parameter optimisation due to parameter\nsharing that leverages global information over the entire sequence.\nFurthermore, we propose an epipolar geometry loss to enhance the supervision\nthat leverages the correspondences exported from COLMAP results without the\nextra computational overhead. Our method yields promising results. On the DTU\ndataset, we reduce the rotation error by 78\\% for COLMAP poses, leading to the\ndecreased reconstruction Chamfer distance from 3.48mm to 0.85mm. On the\nMobileBrick dataset that contains casually captured unbounded 360-degree\nvideos, our method refines ARKit poses and improves the reconstruction F1 score\nfrom 69.18 to 75.67, outperforming that with the dataset provided ground-truth\npose (75.14). 
These achievements demonstrate the efficacy of our approach in\nrefining camera poses and improving the accuracy of neural surface\nreconstruction in real-world scenarios.\n","authors":["Jia-Wang Bian","Wenjing Bian","Victor Adrian Prisacariu","Philip Torr"],"pdf_url":"https://arxiv.org/pdf/2310.07449v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2310.08177v1","updated":"2023-10-12T10:03:25Z","published":"2023-10-12T10:03:25Z","title":"Improving Fast Minimum-Norm Attacks with Hyperparameter Optimization","summary":" Evaluating the adversarial robustness of machine learning models using\ngradient-based attacks is challenging. In this work, we show that\nhyperparameter optimization can improve fast minimum-norm attacks by automating\nthe selection of the loss function, the optimizer and the step-size scheduler,\nalong with the corresponding hyperparameters. Our extensive evaluation\ninvolving several robust models demonstrates the improved efficacy of fast\nminimum-norm attacks when hyper-up with hyperparameter optimization. We release\nour open-source code at https://github.com/pralab/HO-FMN.\n","authors":["Giuseppe Floris","Raffaele Mura","Luca Scionis","Giorgio Piras","Maura Pintor","Ambra Demontis","Battista Biggio"],"pdf_url":"https://arxiv.org/pdf/2310.08177v1.pdf","comment":"Accepted at ESANN23"},{"id":"http://arxiv.org/abs/2310.08165v1","updated":"2023-10-12T09:37:56Z","published":"2023-10-12T09:37:56Z","title":"COVID-19 Detection Using Swin Transformer Approach from Computed\n Tomography Images","summary":" The accurate and efficient diagnosis of COVID-19 is of paramount importance,\nparticularly in the context of large-scale medical imaging datasets. In this\npreprint paper, we propose a novel approach for COVID-19 diagnosis using CT\nimages that leverages the power of Swin Transformer models, state-of-the-art\nsolutions in computer vision tasks. Our method includes a systematic approach\nfor patient-level predictions, where individual CT slices are classified as\nCOVID-19 or non-COVID, and the patient's overall diagnosis is determined\nthrough majority voting. The application of the Swin Transformer in this\ncontext results in patient-level predictions that demonstrate exceptional\ndiagnostic accuracy. In terms of evaluation metrics, our approach consistently\noutperforms the baseline, as well as numerous competing methods, showcasing its\neffectiveness in COVID-19 diagnosis. The macro F1 score achieved by our model\nexceeds the baseline and offers a robust solution for accurate diagnosis.\n","authors":["Kenan Morani"],"pdf_url":"https://arxiv.org/pdf/2310.08165v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.17046v2","updated":"2023-10-12T09:22:10Z","published":"2023-06-29T15:43:06Z","title":"Spiking Denoising Diffusion Probabilistic Models","summary":" Spiking neural networks (SNNs) have ultra-low energy consumption and high\nbiological plausibility due to their binary and bio-driven nature compared with\nartificial neural networks (ANNs). While previous research has primarily\nfocused on enhancing the performance of SNNs in classification tasks, the\ngenerative potential of SNNs remains relatively unexplored. In our paper, we\nput forward Spiking Denoising Diffusion Probabilistic Models (SDDPM), a new\nclass of SNN-based generative models that achieve high sample quality. 
To fully\nexploit the energy efficiency of SNNs, we propose a purely Spiking U-Net\narchitecture, which achieves comparable performance to its ANN counterpart\nusing only 4 time steps, resulting in significantly reduced energy consumption.\nExtensive experimental results reveal that our approach achieves\nstate-of-the-art on the generative tasks and substantially outperforms other\nSNN-based generative models, achieving up to $12\\times$ and $6\\times$\nimprovement on the CIFAR-10 and the CelebA datasets, respectively. Moreover, we\npropose a threshold-guided strategy that can further improve the performances\nby 16.7% in a training-free manner. The SDDPM symbolizes a significant\nadvancement in the field of SNN generation, injecting new perspectives and\npotential avenues of exploration.\n","authors":["Jiahang Cao","Ziqing Wang","Hanzhong Guo","Hao Cheng","Qiang Zhang","Renjing Xu"],"pdf_url":"https://arxiv.org/pdf/2306.17046v2.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2207.09339v3","updated":"2023-10-12T09:13:37Z","published":"2022-07-19T15:49:35Z","title":"Vision Transformers: From Semantic Segmentation to Dense Prediction","summary":" The emergence of vision transformers (ViTs) in image classification has\nshifted the methodologies for visual representation learning. In particular,\nViTs learn visual representation at full receptive field per layer across all\nthe image patches, in comparison to the increasing receptive fields of CNNs\nacross layers and other alternatives (e.g., large kernels and atrous\nconvolution). In this work, for the first time we explore the global context\nlearning potentials of ViTs for dense visual prediction (e.g., semantic\nsegmentation). Our motivation is that through learning global context at full\nreceptive field layer by layer, ViTs may capture stronger long-range dependency\ninformation, critical for dense prediction tasks. We first demonstrate that\nencoding an image as a sequence of patches, a vanilla ViT without local\nconvolution and resolution reduction can yield stronger visual representation\nfor semantic segmentation. For example, our model, termed as SEgmentation\nTRansformer (SETR), excels on ADE20K (50.28% mIoU, the first position in the\ntest leaderboard on the day of submission) and Pascal Context (55.83% mIoU),\nand performs competitively on Cityscapes. For tackling general dense visual\nprediction tasks in a cost-effective manner, we further formulate a family of\nHierarchical Local-Global (HLG) Transformers, characterized by local attention\nwithin windows and global-attention across windows in a pyramidal architecture.\nExtensive experiments show that our methods achieve appealing performance on a\nvariety of dense prediction tasks (e.g., object detection and instance\nsegmentation and semantic segmentation) as well as image classification. Our\ncode and models are available at https://github.com/fudan-zvg/SETR.\n","authors":["Li Zhang","Jiachen Lu","Sixiao Zheng","Xinxuan Zhao","Xiatian Zhu","Yanwei Fu","Tao Xiang","Jianfeng Feng","Philip H. S. Torr"],"pdf_url":"https://arxiv.org/pdf/2207.09339v3.pdf","comment":"Extended version of CVPR 2021 paper arXiv:2012.15840"},{"id":"http://arxiv.org/abs/2301.12082v3","updated":"2023-10-12T09:01:04Z","published":"2023-01-28T03:58:32Z","title":"Pushing the Limits of Fewshot Anomaly Detection in Industry Vision:\n Graphcore","summary":" In the area of fewshot anomaly detection (FSAD), efficient visual feature\nplays an essential role in memory bank M-based methods. 
However, these methods\ndo not account for the relationship between the visual feature and its rotated\nvisual feature, drastically limiting the anomaly detection performance. To push\nthe limits, we reveal that rotation-invariant feature property has a\nsignificant impact in industrial-based FSAD. Specifically, we utilize graph\nrepresentation in FSAD and provide a novel visual isometric invariant feature\n(VIIF) as anomaly measurement feature. As a result, VIIF can robustly improve\nthe anomaly discriminating ability and can further reduce the size of redundant\nfeatures stored in M by a large amount. Besides, we provide a novel model\nGraphCore via VIIFs that can fast implement unsupervised FSAD training and can\nimprove the performance of anomaly detection. A comprehensive evaluation is\nprovided for comparing GraphCore and other SOTA anomaly detection models under\nour proposed fewshot anomaly detection setting, which shows GraphCore can\nincrease average AUC by 5.8%, 4.1%, 3.4%, and 1.6% on MVTec AD and by 25.5%,\n22.0%, 16.9%, and 14.1% on MPDD for 1, 2, 4, and 8-shot cases, respectively.\n","authors":["Guoyang Xie","Jinbao Wang","Jiaqi Liu","Feng Zheng","Yaochu Jin"],"pdf_url":"https://arxiv.org/pdf/2301.12082v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.02081v2","updated":"2023-10-12T09:00:34Z","published":"2022-10-05T08:19:16Z","title":"Locate before Answering: Answer Guided Question Localization for Video\n Question Answering","summary":" Video question answering (VideoQA) is an essential task in vision-language\nunderstanding, which has attracted numerous research attention recently.\nNevertheless, existing works mostly achieve promising performances on short\nvideos of duration within 15 seconds. For VideoQA on minute-level long-term\nvideos, those methods are likely to fail because of lacking the ability to deal\nwith noise and redundancy caused by scene changes and multiple actions in the\nvideo. Considering the fact that the question often remains concentrated in a\nshort temporal range, we propose to first locate the question to a segment in\nthe video and then infer the answer using the located segment only. Under this\nscheme, we propose \"Locate before Answering\" (LocAns), a novel approach that\nintegrates a question locator and an answer predictor into an end-to-end model.\nDuring the training phase, the available answer label not only serves as the\nsupervision signal of the answer predictor, but also is used to generate pseudo\ntemporal labels for the question locator. Moreover, we design a decoupled\nalternative training strategy to update the two modules separately. In the\nexperiments, LocAns achieves state-of-the-art performance on two modern\nlong-term VideoQA datasets NExT-QA and ActivityNet-QA, and its qualitative\nexamples show the reliable performance of the question localization.\n","authors":["Tianwen Qian","Ran Cui","Jingjing Chen","Pai Peng","Xiaowei Guo","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2210.02081v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08143v1","updated":"2023-10-12T08:58:01Z","published":"2023-10-12T08:58:01Z","title":"A Deep Learning Framework for Spatiotemporal Ultrasound Localization\n Microscopy","summary":" Ultrasound Localization Microscopy can resolve the microvascular bed down to\na few micrometers. To achieve such performance microbubble contrast agents must\nperfuse the entire microvascular network. 
Microbubbles are then located\nindividually and tracked over time to sample individual vessels, typically over\nhundreds of thousands of images. To overcome the fundamental limit of\ndiffraction and achieve a dense reconstruction of the network, low microbubble\nconcentrations must be used, which lead to acquisitions lasting several\nminutes. Conventional processing pipelines are currently unable to deal with\ninterference from multiple nearby microbubbles, further reducing achievable\nconcentrations. This work overcomes this problem by proposing a Deep Learning\napproach to recover dense vascular networks from ultrasound acquisitions with\nhigh microbubble concentrations. A realistic mouse brain microvascular network,\nsegmented from 2-photon microscopy, was used to train a three-dimensional\nconvolutional neural network based on a V-net architecture. Ultrasound data\nsets from multiple microbubbles flowing through the microvascular network were\nsimulated and used as ground truth to train the 3D CNN to track microbubbles.\nThe 3D-CNN approach was validated in silico using a subset of the data and in\nvivo on a rat brain acquisition. In silico, the CNN reconstructed vascular\nnetworks with higher precision (81%) than a conventional ULM framework (70%).\nIn vivo, the CNN could resolve micro vessels as small as 10 $\\mu$m with an\nincrease in resolution when compared against a conventional approach.\n","authors":["Léo Milecki","Jonathan Porée","Hatim Belgharbi","Chloé Bourquin","Rafat Damseh","Patrick Delafontaine-Martel","Frédéric Lesage","Maxime Gasse","Jean Provost"],"pdf_url":"https://arxiv.org/pdf/2310.08143v1.pdf","comment":"Copyright 2021 IEEE. Personal use of this material is permitted.\n Permission from IEEE must be obtained for all other uses, in any current or\n future media, including reprinting/republishing this material for advertising\n or promotional purposes, creating new collective works, for resale or\n redistribution to servers or lists, or reuse of any copyrighted component of\n this work in other works"},{"id":"http://arxiv.org/abs/2310.08142v1","updated":"2023-10-12T08:57:33Z","published":"2023-10-12T08:57:33Z","title":"Fine-Grained Annotation for Face Anti-Spoofing","summary":" Face anti-spoofing plays a critical role in safeguarding facial recognition\nsystems against presentation attacks. While existing deep learning methods show\npromising results, they still suffer from the lack of fine-grained annotations,\nwhich lead models to learn task-irrelevant or unfaithful features. In this\npaper, we propose a fine-grained annotation method for face anti-spoofing.\nSpecifically, we first leverage the Segment Anything Model (SAM) to obtain\npixel-wise segmentation masks by utilizing face landmarks as point prompts. The\nface landmarks provide segmentation semantics, which segments the face into\nregions. We then adopt these regions as masks and assemble them into three\nseparate annotation maps: spoof, living, and background maps. Finally, we\ncombine three separate maps into a three-channel map as annotations for model\ntraining. 
Furthermore, we introduce the Multi-Channel Region Exchange\nAugmentation (MCREA) to diversify training data and reduce overfitting.\nExperimental results demonstrate that our method outperforms existing\nstate-of-the-art approaches in both intra-dataset and cross-dataset\nevaluations.\n","authors":["Xu Chen","Yunde Jia","Yuwei Wu"],"pdf_url":"https://arxiv.org/pdf/2310.08142v1.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2310.08139v1","updated":"2023-10-12T08:55:10Z","published":"2023-10-12T08:55:10Z","title":"DualAug: Exploiting Additional Heavy Augmentation with OOD Data\n Rejection","summary":" Data augmentation is a dominant method for reducing model overfitting and\nimproving generalization. Most existing data augmentation methods tend to find\na compromise in augmenting the data, \textit{i.e.}, increasing the amplitude of\naugmentation carefully to avoid degrading some data too much and doing harm to\nthe model performance. We delve into the relationship between data augmentation\nand model performance, revealing that the performance drop with heavy\naugmentation comes from the presence of out-of-distribution (OOD) data.\nNonetheless, as the same data transformation has different effects for\ndifferent training samples, even for heavy augmentation, there remains part of\nin-distribution data which is beneficial to model training. Based on this\nobservation, we propose a novel data augmentation method, named\n\textbf{DualAug}, to keep the augmentation in distribution as much as possible\nat a reasonable time and computational cost. We design a data mixing strategy\nto fuse augmented data from both the basic- and the heavy-augmentation\nbranches. Extensive experiments on supervised image classification benchmarks\nshow that DualAug improves various automated data augmentation methods. Moreover,\nthe experiments on semi-supervised learning and contrastive self-supervised\nlearning demonstrate that our DualAug can also improve related methods. Code is\navailable at\n\href{https://github.com/shuguang99/DualAug}{https://github.com/shuguang99/DualAug}.\n","authors":["Zehao Wang","Yiwen Guo","Qizhang Li","Guanglei Yang","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2310.08139v1.pdf","comment":"14 pages, 6 figures"},{"id":"http://arxiv.org/abs/2301.11514v5","updated":"2023-10-12T08:49:31Z","published":"2023-01-27T03:18:09Z","title":"Deep Industrial Image Anomaly Detection: A Survey","summary":" The recent rapid development of deep learning has laid a milestone in\nindustrial Image Anomaly Detection (IAD). In this paper, we provide a\ncomprehensive review of deep learning-based image anomaly detection techniques,\nfrom the perspectives of neural network architectures, levels of supervision,\nloss functions, metrics and datasets. In addition, we extract the new setting\nfrom industrial manufacturing and review the current IAD approaches under our\nproposed new setting. Moreover, we highlight several open challenges for\nimage anomaly detection. The merits and downsides of representative network\narchitectures under varying supervision are discussed. Finally, we summarize\nthe research findings and point out future research directions. 
More resources\nare available at\nhttps://github.com/M-3LAB/awesome-industrial-anomaly-detection.\n","authors":["Jiaqi Liu","Guoyang Xie","Jinbao Wang","Shangnian Li","Chengjie Wang","Feng Zheng","Yaochu Jin"],"pdf_url":"https://arxiv.org/pdf/2301.11514v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.13359v3","updated":"2023-10-12T08:47:19Z","published":"2023-01-31T01:24:45Z","title":"IM-IAD: Industrial Image Anomaly Detection Benchmark in Manufacturing","summary":" Image anomaly detection (IAD) is an urgent issue that needs to be addressed\nin modern industrial manufacturing (IM). Recently, many advanced algorithms\nhave been released, but their performance varies greatly due to non-uniformed\nsettings. That is, researchers find it difficult to analyze because they are\ndesigned for different or specific cases in IM. To eliminate this problem, we\nfirst propose a uniform IAD setting to systematically assess the effectiveness\nof these algorithms, mainly considering three aspects of supervision level\n(unsupervised, fully supervised), learning paradigm (few-shot, continual, noisy\nlabel), and efficiency (memory usage, inference speed). Then, we skillfully\nconstruct a comprehensive image anomaly detection benchmark (IM-IAD), which\nincludes 19 algorithms on 7 major datasets with the same setting. Our extensive\nexperiments (17,017 total) provide new insights into the redesign or selection\nof the IAD algorithm under uniform conditions. Importantly, the proposed IM-IAD\npresents feasible challenges and future directions for further work. We believe\nthat this work can have a significant impact on the IAD field. To foster\nreproducibility and accessibility, the source code of IM-IAD is uploaded on the\nwebsite, https://github.com/M-3LAB/IM-IAD.\n","authors":["Guoyang Xie","Jinbao Wang","Jiaqi Liu","Jiayi Lyu","Yong Liu","Chengjie Wang","Feng Zheng","Yaochu Jin"],"pdf_url":"https://arxiv.org/pdf/2301.13359v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08129v1","updated":"2023-10-12T08:36:25Z","published":"2023-10-12T08:36:25Z","title":"Tailored Visions: Enhancing Text-to-Image Generation with Personalized\n Prompt Rewriting","summary":" We propose a novel perspective of viewing large pretrained models as search\nengines, thereby enabling the repurposing of techniques previously used to\nenhance search engine performance. As an illustration, we employ a personalized\nquery rewriting technique in the realm of text-to-image generation. Despite\nsignificant progress in the field, it is still challenging to create\npersonalized visual representations that align closely with the desires and\npreferences of individual users. This process requires users to articulate\ntheir ideas in words that are both comprehensible to the models and accurately\ncapture their vision, posing difficulties for many users. In this paper, we\ntackle this challenge by leveraging historical user interactions with the\nsystem to enhance user prompts. We propose a novel approach that involves\nrewriting user prompts based a new large-scale text-to-image dataset with over\n300k prompts from 3115 users. Our rewriting model enhances the expressiveness\nand alignment of user prompts with their intended visual outputs. Experimental\nresults demonstrate the superiority of our methods over baseline approaches, as\nevidenced in our new offline evaluation method and online tests. 
Our approach\nopens up exciting possibilities of applying more search engine techniques to\nbuild truly personalized large pretrained models.\n","authors":["Zijie Chen","Lichao Zhang","Fangsheng Weng","Lili Pan","Zhenzhong Lan"],"pdf_url":"https://arxiv.org/pdf/2310.08129v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.03198v5","updated":"2023-10-12T08:31:50Z","published":"2023-04-06T16:21:56Z","title":"RFAConv: Innovating Spatial Attention and Standard Convolutional\n Operation","summary":" Spatial attention has been widely used to improve the performance of\nconvolutional neural networks. However, it has certain limitations. In this\npaper, we propose a new perspective on the effectiveness of spatial attention,\nwhich is that the spatial attention mechanism essentially solves the problem of\nconvolutional kernel parameter sharing. However, the information contained in\nthe attention map generated by spatial attention is not sufficient for\nlarge-size convolutional kernels. Therefore, we propose a novel attention\nmechanism called Receptive-Field Attention (RFA). Existing spatial attention,\nsuch as Convolutional Block Attention Module (CBAM) and Coordinated Attention\n(CA) focus only on spatial features, which does not fully address the problem\nof convolutional kernel parameter sharing. In contrast, RFA not only focuses on\nthe receptive-field spatial feature but also provides effective attention\nweights for large-size convolutional kernels. The Receptive-Field Attention\nconvolutional operation (RFAConv), developed by RFA, represents a new approach\nto replace the standard convolution operation. It offers nearly negligible\nincrement of computational cost and parameters, while significantly improving\nnetwork performance. We conducted a series of experiments on ImageNet-1k, COCO,\nand VOC datasets to demonstrate the superiority of our approach. Of particular\nimportance, we believe that it is time to shift focus from spatial features to\nreceptive-field spatial features for current spatial attention mechanisms. In\nthis way, we can further improve network performance and achieve even better\nresults. The code and pre-trained models for the relevant tasks can be found at\nhttps://github.com/Liuchen1997/RFAConv.\n","authors":["Xin Zhang","Chen Liu","Degang Yang","Tingting Song","Yichen Ye","Ke Li","Yingze Song"],"pdf_url":"https://arxiv.org/pdf/2304.03198v5.pdf","comment":"12 pages, 11figures"},{"id":"http://arxiv.org/abs/2310.08117v1","updated":"2023-10-12T08:21:17Z","published":"2023-10-12T08:21:17Z","title":"DUSA: Decoupled Unsupervised Sim2Real Adaptation for\n Vehicle-to-Everything Collaborative Perception","summary":" Vehicle-to-Everything (V2X) collaborative perception is crucial for\nautonomous driving. However, achieving high-precision V2X perception requires a\nsignificant amount of annotated real-world data, which can always be expensive\nand hard to acquire. Simulated data have raised much attention since they can\nbe massively produced at an extremely low cost. Nevertheless, the significant\ndomain gap between simulated and real-world data, including differences in\nsensor type, reflectance patterns, and road surroundings, often leads to poor\nperformance of models trained on simulated data when evaluated on real-world\ndata. In addition, there remains a domain gap between real-world collaborative\nagents, e.g. 
different types of sensors may be installed on autonomous vehicles\nand roadside infrastructures with different extrinsics, further increasing the\ndifficulty of sim2real generalization. To take full advantage of simulated\ndata, we present a new unsupervised sim2real domain adaptation method for V2X\ncollaborative detection named Decoupled Unsupervised Sim2Real Adaptation\n(DUSA). Our new method decouples the V2X collaborative sim2real domain\nadaptation problem into two sub-problems: sim2real adaptation and inter-agent\nadaptation. For sim2real adaptation, we design a Location-adaptive Sim2Real\nAdapter (LSA) module to adaptively aggregate features from critical locations\nof the feature map and align the features between simulated data and real-world\ndata via a sim/real discriminator on the aggregated global feature. For\ninter-agent adaptation, we further devise a Confidence-aware Inter-agent\nAdapter (CIA) module to align the fine-grained features from heterogeneous\nagents under the guidance of agent-wise confidence maps. Experiments\ndemonstrate the effectiveness of the proposed DUSA approach on unsupervised\nsim2real adaptation from the simulated V2XSet dataset to the real-world\nDAIR-V2X-C dataset.\n","authors":["Xianghao Kong","Wentao Jiang","Jinrang Jia","Yifeng Shi","Runsheng Xu","Si Liu"],"pdf_url":"https://arxiv.org/pdf/2310.08117v1.pdf","comment":"ACM MM 2023"},{"id":"http://arxiv.org/abs/2310.08116v1","updated":"2023-10-12T08:17:57Z","published":"2023-10-12T08:17:57Z","title":"Multimodal Active Measurement for Human Mesh Recovery in Close Proximity","summary":" For safe and sophisticated physical human-robot interactions (pHRI), a robot\nneeds to estimate the accurate body pose or mesh of the target person. However,\nin these pHRI scenarios, the robot cannot fully observe the target person's\nbody with equipped cameras because the target person is usually close to the\nrobot. This leads to severe truncation and occlusions, and results in poor\naccuracy of human pose estimation. For better accuracy of human pose estimation\nor mesh recovery on this limited information from cameras, we propose an active\nmeasurement and sensor fusion framework of the equipped cameras and other\nsensors such as touch sensors and 2D LiDAR. These touch and LiDAR sensing are\nobtained attendantly through pHRI without additional costs. These sensor\nmeasurements are sparse but reliable and informative cues for human mesh\nrecovery. In our active measurement process, camera viewpoints and sensor\nplacements are optimized based on the uncertainty of the estimated pose, which\nis closely related to the truncated or occluded areas. In our sensor fusion\nprocess, we fuse the sensor measurements to the camera-based estimated pose by\nminimizing the distance between the estimated mesh and measured positions. Our\nmethod is agnostic to robot configurations. Experiments were conducted using\nthe Toyota Human Support Robot, which has a camera, 2D LiDAR, and a touch\nsensor on the robot arm. Our proposed method demonstrated the superiority in\nthe human pose estimation accuracy on the quantitative comparison. 
Furthermore,\nour proposed method reliably estimated the pose of the target person in\npractical settings such as target people occluded by a blanket and standing aid\nwith the robot arm.\n","authors":["Takahiro Maeda","Keisuke Takeshita","Kazuhito Tanaka"],"pdf_url":"https://arxiv.org/pdf/2310.08116v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.14108v3","updated":"2023-10-12T08:05:18Z","published":"2022-11-25T13:50:00Z","title":"3DDesigner: Towards Photorealistic 3D Object Generation and Editing with\n Text-guided Diffusion Models","summary":" Text-guided diffusion models have shown superior performance in image/video\ngeneration and editing. While few explorations have been performed in 3D\nscenarios. In this paper, we discuss three fundamental and interesting problems\non this topic. First, we equip text-guided diffusion models to achieve\n3D-consistent generation. Specifically, we integrate a NeRF-like neural field\nto generate low-resolution coarse results for a given camera view. Such results\ncan provide 3D priors as condition information for the following diffusion\nprocess. During denoising diffusion, we further enhance the 3D consistency by\nmodeling cross-view correspondences with a novel two-stream (corresponding to\ntwo different views) asynchronous diffusion process. Second, we study 3D local\nediting and propose a two-step solution that can generate 360-degree\nmanipulated results by editing an object from a single view. Step 1, we propose\nto perform 2D local editing by blending the predicted noises. Step 2, we\nconduct a noise-to-text inversion process that maps 2D blended noises into the\nview-independent text embedding space. Once the corresponding text embedding is\nobtained, 360-degree images can be generated. Last but not least, we extend our\nmodel to perform one-shot novel view synthesis by fine-tuning on a single\nimage, firstly showing the potential of leveraging text guidance for novel view\nsynthesis. Extensive experiments and various applications show the prowess of\nour 3DDesigner. The project page is available at\nhttps://3ddesigner-diffusion.github.io/.\n","authors":["Gang Li","Heliang Zheng","Chaoyue Wang","Chang Li","Changwen Zheng","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2211.14108v3.pdf","comment":"Submitted to IJCV"},{"id":"http://arxiv.org/abs/2310.00847v2","updated":"2023-10-12T08:04:14Z","published":"2023-10-02T02:01:00Z","title":"Can Pre-trained Networks Detect Familiar Out-of-Distribution Data?","summary":" Out-of-distribution (OOD) detection is critical for safety-sensitive machine\nlearning applications and has been extensively studied, yielding a plethora of\nmethods developed in the literature. However, most studies for OOD detection\ndid not use pre-trained models and trained a backbone from scratch. In recent\nyears, transferring knowledge from large pre-trained models to downstream tasks\nby lightweight tuning has become mainstream for training in-distribution (ID)\nclassifiers. To bridge the gap between the practice of OOD detection and\ncurrent classifiers, the unique and crucial problem is that the samples whose\ninformation networks know often come as OOD input. We consider that such data\nmay significantly affect the performance of large pre-trained networks because\nthe discriminability of these OOD data depends on the pre-training algorithm.\nHere, we define such OOD data as PT-OOD (Pre-Trained OOD) data. 
In this paper,\nwe aim to reveal the effect of PT-OOD on the OOD detection performance of\npre-trained networks from the perspective of pre-training algorithms. To\nachieve this, we explore the PT-OOD detection performance of supervised and\nself-supervised pre-training algorithms with linear-probing tuning, the most\ncommon efficient tuning method. Through our experiments and analysis, we find\nthat the low linear separability of PT-OOD in the feature space heavily\ndegrades the PT-OOD detection performance, and self-supervised models are more\nvulnerable to PT-OOD than supervised pre-trained models, even with\nstate-of-the-art detection methods. To solve this vulnerability, we further\npropose a unique solution to large-scale pre-trained models: Leveraging\npowerful instance-by-instance discriminative representations of pre-trained\nmodels and detecting OOD in the feature space independent of the ID decision\nboundaries. The code will be available via https://github.com/AtsuMiyai/PT-OOD.\n","authors":["Atsuyuki Miyai","Qing Yu","Go Irie","Kiyoharu Aizawa"],"pdf_url":"https://arxiv.org/pdf/2310.00847v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07522v2","updated":"2023-10-12T08:02:18Z","published":"2023-10-11T14:19:05Z","title":"S4C: Self-Supervised Semantic Scene Completion with Neural Fields","summary":" 3D semantic scene understanding is a fundamental challenge in computer\nvision. It enables mobile agents to autonomously plan and navigate arbitrary\nenvironments. SSC formalizes this challenge as jointly estimating dense\ngeometry and semantic information from sparse observations of a scene. Current\nmethods for SSC are generally trained on 3D ground truth based on aggregated\nLiDAR scans. This process relies on special sensors and annotation by hand\nwhich are costly and do not scale well. To overcome this issue, our work\npresents the first self-supervised approach to SSC called S4C that does not\nrely on 3D ground truth data. Our proposed method can reconstruct a scene from\na single image and only relies on videos and pseudo segmentation ground truth\ngenerated from off-the-shelf image segmentation network during training. Unlike\nexisting methods, which use discrete voxel grids, we represent scenes as\nimplicit semantic fields. This formulation allows querying any point within the\ncamera frustum for occupancy and semantic class. Our architecture is trained\nthrough rendering-based self-supervised losses. Nonetheless, our method\nachieves performance close to fully supervised state-of-the-art methods.\nAdditionally, our method demonstrates strong generalization capabilities and\ncan synthesize accurate segmentation maps for far away viewpoints.\n","authors":["Adrian Hayler","Felix Wimbauer","Dominik Muhle","Christian Rupprecht","Daniel Cremers"],"pdf_url":"https://arxiv.org/pdf/2310.07522v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08106v1","updated":"2023-10-12T08:01:11Z","published":"2023-10-12T08:01:11Z","title":"Generalized Logit Adjustment: Calibrating Fine-tuned Models by Removing\n Label Bias in Foundation Models","summary":" Foundation models like CLIP allow zero-shot transfer on various tasks without\nadditional training data. Yet, the zero-shot performance is less competitive\nthan a fully supervised one. Thus, to enhance the performance, fine-tuning and\nensembling are also commonly adopted to better fit the downstream tasks.\nHowever, we argue that such prior work has overlooked the inherent biases in\nfoundation models. 
Due to the highly imbalanced Web-scale training set, these\nfoundation models are inevitably skewed toward frequent semantics, and thus the\nsubsequent fine-tuning or ensembling is still biased. In this study, we\nsystematically examine the biases in foundation models and demonstrate the\nefficacy of our proposed Generalized Logit Adjustment (GLA) method. Note that\nbias estimation in foundation models is challenging, as most pre-train data\ncannot be explicitly accessed like in traditional long-tailed classification\ntasks. To this end, GLA has an optimization-based bias estimation approach for\ndebiasing foundation models. As our work resolves a fundamental flaw in the\npre-training, the proposed GLA demonstrates significant improvements across a\ndiverse range of tasks: it achieves 1.5 pp accuracy gains on ImageNet, an large\naverage improvement (1.4-4.6 pp) on 11 few-shot datasets, 2.4 pp gains on\nlong-tailed classification. Codes are in \\url{https://github.com/BeierZhu/GLA}.\n","authors":["Beier Zhu","Kaihua Tang","Qianru Sun","Hanwang Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.08106v1.pdf","comment":"Accepted by NeurIPS2023"},{"id":"http://arxiv.org/abs/2212.09408v3","updated":"2023-10-12T07:55:38Z","published":"2022-12-19T12:40:13Z","title":"Universal Object Detection with Large Vision Model","summary":" Over the past few years, there has been growing interest in developing a\nbroad, universal, and general-purpose computer vision system. Such systems have\nthe potential to address a wide range of vision tasks simultaneously, without\nbeing limited to specific problems or data domains. This universality is\ncrucial for practical, real-world computer vision applications. In this study,\nour focus is on a specific challenge: the large-scale, multi-domain universal\nobject detection problem, which contributes to the broader goal of achieving a\nuniversal vision system. This problem presents several intricate challenges,\nincluding cross-dataset category label duplication, label conflicts, and the\nnecessity to handle hierarchical taxonomies. To address these challenges, we\nintroduce our approach to label handling, hierarchy-aware loss design, and\nresource-efficient model training utilizing a pre-trained large vision model.\nOur method has demonstrated remarkable performance, securing a prestigious\nsecond-place ranking in the object detection track of the Robust Vision\nChallenge 2022 (RVC 2022) on a million-scale cross-dataset object detection\nbenchmark. We believe that our comprehensive study will serve as a valuable\nreference and offer an alternative approach for addressing similar challenges\nwithin the computer vision community. The source code for our work is openly\navailable at https://github.com/linfeng93/Large-UniDet.\n","authors":["Feng Lin","Wenze Hu","Yaowei Wang","Yonghong Tian","Guangming Lu","Fanglin Chen","Yong Xu","Xiaoyu Wang"],"pdf_url":"https://arxiv.org/pdf/2212.09408v3.pdf","comment":"Accepted by International Journal of Computer Vision (IJCV). The 2nd\n place in the object detection track of the Robust Vision Challenge (RVC 2022)"},{"id":"http://arxiv.org/abs/2309.15505v2","updated":"2023-10-12T07:55:05Z","published":"2023-09-27T09:13:40Z","title":"Finite Scalar Quantization: VQ-VAE Made Simple","summary":" We propose to replace vector quantization (VQ) in the latent representation\nof VQ-VAEs with a simple scheme termed finite scalar quantization (FSQ), where\nwe project the VAE representation down to a few dimensions (typically less than\n10). 
Each dimension is quantized to a small set of fixed values, leading to an\n(implicit) codebook given by the product of these sets. By appropriately\nchoosing the number of dimensions and values each dimension can take, we obtain\nthe same codebook size as in VQ. On top of such discrete representations, we\ncan train the same models that have been trained on VQ-VAE representations. For\nexample, autoregressive and masked transformer models for image generation,\nmultimodal generation, and dense prediction computer vision tasks. Concretely,\nwe employ FSQ with MaskGIT for image generation, and with UViM for depth\nestimation, colorization, and panoptic segmentation. Despite the much simpler\ndesign of FSQ, we obtain competitive performance in all these tasks. We\nemphasize that FSQ does not suffer from codebook collapse and does not need the\ncomplex machinery employed in VQ (commitment losses, codebook reseeding, code\nsplitting, entropy penalties, etc.) to learn expressive discrete\nrepresentations.\n","authors":["Fabian Mentzer","David Minnen","Eirikur Agustsson","Michael Tschannen"],"pdf_url":"https://arxiv.org/pdf/2309.15505v2.pdf","comment":"Code:\n https://github.com/google-research/google-research/tree/master/fsq"},{"id":"http://arxiv.org/abs/2310.08094v1","updated":"2023-10-12T07:40:39Z","published":"2023-10-12T07:40:39Z","title":"SingleInsert: Inserting New Concepts from a Single Image into\n Text-to-Image Models for Flexible Editing","summary":" Recent progress in text-to-image (T2I) models enables high-quality image\ngeneration with flexible textual control. To utilize the abundant visual priors\nin the off-the-shelf T2I models, a series of methods try to invert an image to\nproper embedding that aligns with the semantic space of the T2I model. However,\nthese image-to-text (I2T) inversion methods typically need multiple source\nimages containing the same concept or struggle with the imbalance between\nediting flexibility and visual fidelity. In this work, we point out that the\ncritical problem lies in the foreground-background entanglement when learning\nan intended concept, and propose a simple and effective baseline for\nsingle-image I2T inversion, named SingleInsert. SingleInsert adopts a two-stage\nscheme. In the first stage, we regulate the learned embedding to concentrate on\nthe foreground area without being associated with the irrelevant background. In\nthe second stage, we finetune the T2I model for better visual resemblance and\ndevise a semantic loss to prevent the language drift problem. With the proposed\ntechniques, SingleInsert excels in single concept generation with high visual\nfidelity while allowing flexible editing. Additionally, SingleInsert can\nperform single-image novel view synthesis and multiple concepts composition\nwithout requiring joint training. To facilitate evaluation, we design an\nediting prompt list and introduce a metric named Editing Success Rate (ESR) for\nquantitative assessment of editing flexibility. 
Our project page is:\nhttps://jarrentwu1031.github.io/SingleInsert-web/\n","authors":["Zijie Wu","Chaohui Yu","Zhen Zhu","Fan Wang","Xiang Bai"],"pdf_url":"https://arxiv.org/pdf/2310.08094v1.pdf","comment":"Project page: https://jarrentwu1031.github.io/SingleInsert-web/"},{"id":"http://arxiv.org/abs/2310.08092v1","updated":"2023-10-12T07:38:28Z","published":"2023-10-12T07:38:28Z","title":"Consistent123: Improve Consistency for One Image to 3D Object Synthesis","summary":" Large image diffusion models enable novel view synthesis with high quality\nand excellent zero-shot capability. However, such models based on\nimage-to-image translation have no guarantee of view consistency, limiting the\nperformance for downstream tasks like 3D reconstruction and image-to-3D\ngeneration. To empower consistency, we propose Consistent123 to synthesize\nnovel views simultaneously by incorporating additional cross-view attention\nlayers and the shared self-attention mechanism. The proposed attention\nmechanism improves the interaction across all synthesized views, as well as the\nalignment between the condition view and novel views. In the sampling stage,\nsuch architecture supports simultaneously generating an arbitrary number of\nviews while training at a fixed length. We also introduce a progressive\nclassifier-free guidance strategy to achieve the trade-off between texture and\ngeometry for synthesized object views. Qualitative and quantitative experiments\nshow that Consistent123 outperforms baselines in view consistency by a large\nmargin. Furthermore, we demonstrate a significant improvement of Consistent123\non varying downstream tasks, showing its great potential in the 3D generation\nfield. The project page is available at consistent-123.github.io.\n","authors":["Haohan Weng","Tianyu Yang","Jianan Wang","Yu Li","Tong Zhang","C. L. Philip Chen","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.08092v1.pdf","comment":"For more qualitative results, please see\n https://consistent-123.github.io/"},{"id":"http://arxiv.org/abs/2212.00564v2","updated":"2023-10-12T07:21:23Z","published":"2022-12-01T15:11:21Z","title":"Leveraging Single-View Images for Unsupervised 3D Point Cloud Completion","summary":" Point clouds captured by scanning devices are often incomplete due to\nocclusion. To overcome this limitation, point cloud completion methods have\nbeen developed to predict the complete shape of an object based on its partial\ninput. These methods can be broadly classified as supervised or unsupervised.\nHowever, both categories require a large number of 3D complete point clouds,\nwhich may be difficult to capture. In this paper, we propose Cross-PCC, an\nunsupervised point cloud completion method without requiring any 3D complete\npoint clouds. We only utilize 2D images of the complete objects, which are\neasier to capture than 3D complete and clean point clouds. Specifically, to\ntake advantage of the complementary information from 2D images, we use a\nsingle-view RGB image to extract 2D features and design a fusion module to fuse\nthe 2D and 3D features extracted from the partial point cloud. To guide the\nshape of predicted point clouds, we project the predicted points of the object\nto the 2D plane and use the foreground pixels of its silhouette maps to\nconstrain the position of the projected points. To reduce the outliers of the\npredicted point clouds, we propose a view calibrator to move the points\nprojected to the background into the foreground by the single-view silhouette\nimage. 
To the best of our knowledge, our approach is the first point cloud\ncompletion method that does not require any 3D supervision. The experimental\nresults of our method are superior to those of the state-of-the-art\nunsupervised methods by a large margin. Moreover, our method even achieves\ncomparable performance to some supervised methods. We will make the source code\npublicly available at https://github.com/ltwu6/cross-pcc.\n","authors":["Lintai Wu","Qijian Zhang","Junhui Hou","Yong Xu"],"pdf_url":"https://arxiv.org/pdf/2212.00564v2.pdf","comment":"14 pages, 10 figures"},{"id":"http://arxiv.org/abs/2310.08084v1","updated":"2023-10-12T07:17:14Z","published":"2023-10-12T07:17:14Z","title":"Volumetric Medical Image Segmentation via Scribble Annotations and Shape\n Priors","summary":" Recently, weakly-supervised image segmentation using weak annotations like\nscribbles has gained great attention in computer vision and medical image\nanalysis, since such annotations are much easier to obtain compared to\ntime-consuming and labor-intensive labeling at the pixel/voxel level. However,\ndue to a lack of structure supervision on regions of interest (ROIs), existing\nscribble-based methods suffer from poor boundary localization. Furthermore,\nmost current methods are designed for 2D image segmentation, which do not fully\nleverage the volumetric information if directly applied to each image slice. In\nthis paper, we propose a scribble-based volumetric image segmentation,\nScribble2D5, which tackles 3D anisotropic image segmentation and aims to its\nimprove boundary prediction. To achieve this, we augment a 2.5D attention UNet\nwith a proposed label propagation module to extend semantic information from\nscribbles and use a combination of static and active boundary prediction to\nlearn ROI's boundary and regularize its shape. Also, we propose an optional\nadd-on component, which incorporates the shape prior information from unpaired\nsegmentation masks to further improve model accuracy. Extensive experiments on\nthree public datasets and one private dataset demonstrate our Scribble2D5\nachieves state-of-the-art performance on volumetric image segmentation using\nscribbles and shape prior if available.\n","authors":["Qiuhui Chen","Haiying Lyu","Xinyue Hu","Yong Lu","Yi Hong"],"pdf_url":"https://arxiv.org/pdf/2310.08084v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2205.06779"},{"id":"http://arxiv.org/abs/2310.08082v1","updated":"2023-10-12T07:12:20Z","published":"2023-10-12T07:12:20Z","title":"Jointly Optimized Global-Local Visual Localization of UAVs","summary":" Navigation and localization of UAVs present a challenge when global\nnavigation satellite systems (GNSS) are disrupted and unreliable. Traditional\ntechniques, such as simultaneous localization and mapping (SLAM) and visual\nodometry (VO), exhibit certain limitations in furnishing absolute coordinates\nand mitigating error accumulation. Existing visual localization methods achieve\nautonomous visual localization without error accumulation by matching with\northo satellite images. However, doing so cannot guarantee real-time\nperformance due to the complex matching process. To address these challenges,\nwe propose a novel Global-Local Visual Localization (GLVL) network. 
Our GLVL\nnetwork is a two-stage visual localization approach, combining a large-scale\nretrieval module that finds regions similar to the UAV flight scene, and a\nfine-grained matching module that localizes the precise UAV coordinates,\nenabling real-time and precise localization. The training process is jointly\noptimized in an end-to-end manner to further enhance the model capability.\nExperiments on six UAV flight scenes encompassing both texture-rich and\ntexture-sparse regions demonstrate the ability of our model to meet the\nreal-time precise localization requirements of UAVs. Particularly, our method\nachieves a localization error of only 2.39 meters in 0.48 seconds in a village\nscene with sparse texture features.\n","authors":["Haoling Li","Jiuniu Wang","Zhiwei Wei","Wenjia Xu"],"pdf_url":"https://arxiv.org/pdf/2310.08082v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08080v1","updated":"2023-10-12T07:10:12Z","published":"2023-10-12T07:10:12Z","title":"RT-SRTS: Angle-Agnostic Real-Time Simultaneous 3D Reconstruction and\n Tumor Segmentation from Single X-Ray Projection","summary":" Radiotherapy is one of the primary treatment methods for tumors, but the\norgan movement caused by respiratory motion limits its accuracy. Recently, 3D\nimaging from single X-ray projection has received extensive attention as a\npromising way to address this issue. However, current methods can only\nreconstruct the 3D image without directly locating the tumor and are only\nvalidated for fixed-angle imaging, which fails to fully meet the requirement of\nmotion control in radiotherapy. In this study, we propose a novel imaging\nmethod RT-SRTS which integrates 3D imaging and tumor segmentation into one\nnetwork based on multi-task learning (MTL) and achieves real-time\nsimultaneous 3D reconstruction and tumor segmentation from single X-ray\nprojection at any angle. Furthermore, we propose the attention enhanced\ncalibrator (AEC) and uncertain-region elaboration (URE) modules to aid feature\nextraction and improve segmentation accuracy. We evaluated the proposed method\non ten patient cases and compared it with two state-of-the-art methods. Our\napproach not only delivered superior 3D reconstruction but also demonstrated\ncommendable tumor segmentation results. The simultaneous reconstruction and\nsegmentation could be completed in approximately 70 ms, significantly faster\nthan the required time threshold for real-time tumor tracking. The efficacy of\nboth AEC and URE was also validated through ablation studies.\n","authors":["Miao Zhu","Qiming Fu","Bo Liu","Mengxi Zhang","Bojian Li","Xiaoyan Luo","Fugen Zhou"],"pdf_url":"https://arxiv.org/pdf/2310.08080v1.pdf","comment":"27 pages"},{"id":"http://arxiv.org/abs/2210.11318v2","updated":"2023-10-12T06:52:18Z","published":"2022-10-20T14:51:01Z","title":"A Survey of Computer Vision Technologies In Urban and\n Controlled-environment Agriculture","summary":" In the evolution of agriculture to its next stage, Agriculture 5.0,\nartificial intelligence will play a central role. Controlled-environment\nagriculture, or CEA, is a special form of urban and suburban agricultural\npractice that offers numerous economic, environmental, and social benefits,\nincluding shorter transportation routes to population centers, reduced\nenvironmental impact, and increased productivity. 
Due to its ability to control\nenvironmental factors, CEA couples well with computer vision (CV) in the\nadoption of real-time monitoring of the plant conditions and autonomous\ncultivation and harvesting. The objective of this paper is to familiarize CV\nresearchers with agricultural applications and agricultural practitioners with\nthe solutions offered by CV. We identify five major CV applications in CEA,\nanalyze their requirements and motivation, and survey the state of the art as\nreflected in 68 technical papers using deep learning methods. In addition, we\ndiscuss five key subareas of computer vision and how they related to these CEA\nproblems, as well as eleven vision-based CEA datasets. We hope the survey will\nhelp researchers quickly gain a bird-eye view of the striving research area and\nwill spark inspiration for new research and development.\n","authors":["Jiayun Luo","Boyang Li","Cyril Leung"],"pdf_url":"https://arxiv.org/pdf/2210.11318v2.pdf","comment":"1 overview figures, 37 pages, 8 tables, accepted by ACM Computing\n Surveys"},{"id":"http://arxiv.org/abs/2310.08073v1","updated":"2023-10-12T06:50:43Z","published":"2023-10-12T06:50:43Z","title":"Samples on Thin Ice: Re-Evaluating Adversarial Pruning of Neural\n Networks","summary":" Neural network pruning has shown to be an effective technique for reducing\nthe network size, trading desirable properties like generalization and\nrobustness to adversarial attacks for higher sparsity. Recent work has claimed\nthat adversarial pruning methods can produce sparse networks while also\npreserving robustness to adversarial examples. In this work, we first\nre-evaluate three state-of-the-art adversarial pruning methods, showing that\ntheir robustness was indeed overestimated. We then compare pruned and dense\nversions of the same models, discovering that samples on thin ice, i.e., closer\nto the unpruned model's decision boundary, are typically misclassified after\npruning. We conclude by discussing how this intuition may lead to designing\nmore effective adversarial pruning methods in future work.\n","authors":["Giorgio Piras","Maura Pintor","Ambra Demontis","Battista Biggio"],"pdf_url":"https://arxiv.org/pdf/2310.08073v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08071v1","updated":"2023-10-12T06:36:41Z","published":"2023-10-12T06:36:41Z","title":"Learning Transferable Conceptual Prototypes for Interpretable\n Unsupervised Domain Adaptation","summary":" Despite the great progress of unsupervised domain adaptation (UDA) with the\ndeep neural networks, current UDA models are opaque and cannot provide\npromising explanations, limiting their applications in the scenarios that\nrequire safe and controllable model decisions. At present, a surge of work\nfocuses on designing deep interpretable methods with adequate data annotations\nand only a few methods consider the distributional shift problem. Most existing\ninterpretable UDA methods are post-hoc ones, which cannot facilitate the model\nlearning process for performance enhancement. In this paper, we propose an\ninherently interpretable method, named Transferable Conceptual Prototype\nLearning (TCPL), which could simultaneously interpret and improve the processes\nof knowledge transfer and decision-making in UDA. To achieve this goal, we\ndesign a hierarchically prototypical module that transfers categorical basic\nconcepts from the source domain to the target domain and learns domain-shared\nprototypes for explaining the underlying reasoning process. 
With the learned\ntransferable prototypes, a self-predictive consistent pseudo-label strategy\nthat fuses confidence, predictions, and prototype information, is designed for\nselecting suitable target samples for pseudo annotations and gradually\nnarrowing down the domain gap. Comprehensive experiments show that the proposed\nmethod can not only provide effective and intuitive explanations but also\noutperform previous state-of-the-arts.\n","authors":["Junyu Gao","Xinhong Ma","Changsheng Xu"],"pdf_url":"https://arxiv.org/pdf/2310.08071v1.pdf","comment":"Submitted to IEEE TIP"},{"id":"http://arxiv.org/abs/2310.08068v1","updated":"2023-10-12T06:32:12Z","published":"2023-10-12T06:32:12Z","title":"Frequency-Aware Re-Parameterization for Over-Fitting Based Image\n Compression","summary":" Over-fitting-based image compression requires weights compactness for\ncompression and fast convergence for practical use, posing challenges for deep\nconvolutional neural networks (CNNs) based methods. This paper presents a\nsimple re-parameterization method to train CNNs with reduced weights storage\nand accelerated convergence. The convolution kernels are re-parameterized as a\nweighted sum of discrete cosine transform (DCT) kernels enabling direct\noptimization in the frequency domain. Combined with L1 regularization, the\nproposed method surpasses vanilla convolutions by achieving a significantly\nimproved rate-distortion with low computational cost. The proposed method is\nverified with extensive experiments of over-fitting-based image restoration on\nvarious datasets, achieving up to -46.12% BD-rate on top of HEIF with only 200\niterations.\n","authors":["Yun Ye","Yanjie Pan","Qually Jiang","Ming Lu","Xiaoran Fang","Beryl Xu"],"pdf_url":"https://arxiv.org/pdf/2310.08068v1.pdf","comment":"to be published at ICIP 2023, this version fixed a mistake in Eq. (1)\n in the proceeding version"},{"id":"http://arxiv.org/abs/2310.08064v1","updated":"2023-10-12T06:26:39Z","published":"2023-10-12T06:26:39Z","title":"Age Estimation Based on Graph Convolutional Networks and Multi-head\n Attention Mechanisms","summary":" Age estimation technology is a part of facial recognition and has been\napplied to identity authentication. This technology achieves the development\nand application of a juvenile anti-addiction system by authenticating users in\nthe game. Convolutional Neural Network (CNN) and Transformer algorithms are\nwidely used in this application scenario. However, these two models cannot\nflexibly extract and model features of faces with irregular shapes, and they\nare ineffective in capturing key information. Furthermore, the above methods\nwill contain a lot of background information while extracting features, which\nwill interfere with the model. In consequence, it is easy to extract redundant\ninformation from images. In this paper, a new modeling idea is proposed to\nsolve this problem, which can flexibly model irregular objects. The Graph\nConvolutional Network (GCN) is used to extract features from irregular face\nimages effectively, and multi-head attention mechanisms are added to avoid\nredundant features and capture key region information in the image. 
This model\ncan effectively improve the accuracy of age estimation and reduce the MAE error\nvalue to about 3.64, which is better than the effect of today's age estimation\nmodel, to improve the accuracy of face recognition and identity authentication.\n","authors":["Miaomiao Yang","Changwei Yao","Shijin Yan"],"pdf_url":"https://arxiv.org/pdf/2310.08064v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.07763v2","updated":"2023-10-12T06:24:02Z","published":"2023-07-15T10:06:43Z","title":"Tightly-Coupled LiDAR-Visual SLAM Based on Geometric Features for Mobile\n Agents","summary":" The mobile robot relies on SLAM (Simultaneous Localization and Mapping) to\nprovide autonomous navigation and task execution in complex and unknown\nenvironments. However, it is hard to develop a dedicated algorithm for mobile\nrobots due to dynamic and challenging situations, such as poor lighting\nconditions and motion blur. To tackle this issue, we propose a tightly-coupled\nLiDAR-visual SLAM based on geometric features, which includes two sub-systems\n(LiDAR and monocular visual SLAM) and a fusion framework. The fusion framework\nassociates the depth and semantics of the multi-modal geometric features to\ncomplement the visual line landmarks and to add direction optimization in\nBundle Adjustment (BA). This further constrains visual odometry. On the other\nhand, the entire line segment detected by the visual subsystem overcomes the\nlimitation of the LiDAR subsystem, which can only perform the local calculation\nfor geometric features. It adjusts the direction of linear feature points and\nfilters out outliers, leading to a higher accurate odometry system. Finally, we\nemploy a module to detect the subsystem's operation, providing the LiDAR\nsubsystem's output as a complementary trajectory to our system while visual\nsubsystem tracking fails. The evaluation results on the public dataset M2DGR,\ngathered from ground robots across various indoor and outdoor scenarios, show\nthat our system achieves more accurate and robust pose estimation compared to\ncurrent state-of-the-art multi-modal methods.\n","authors":["Ke Cao","Ruiping Liu","Ze Wang","Kunyu Peng","Jiaming Zhang","Junwei Zheng","Zhifeng Teng","Kailun Yang","Rainer Stiefelhagen"],"pdf_url":"https://arxiv.org/pdf/2307.07763v2.pdf","comment":"Accepted to ROBIO 2023"},{"id":"http://arxiv.org/abs/2306.06599v4","updated":"2023-10-12T05:51:30Z","published":"2023-06-11T06:27:06Z","title":"Variational Imbalanced Regression: Fair Uncertainty Quantification via\n Probabilistic Smoothing","summary":" Existing regression models tend to fall short in both accuracy and\nuncertainty estimation when the label distribution is imbalanced. In this\npaper, we propose a probabilistic deep learning model, dubbed variational\nimbalanced regression (VIR), which not only performs well in imbalanced\nregression but naturally produces reasonable uncertainty estimation as a\nbyproduct. Different from typical variational autoencoders assuming I.I.D.\nrepresentations (a data point's representation is not directly affected by\nother data points), our VIR borrows data with similar regression labels to\ncompute the latent representation's variational distribution; furthermore,\ndifferent from deterministic regression models producing point estimates, VIR\npredicts the entire normal-inverse-gamma distributions and modulates the\nassociated conjugate distributions to impose probabilistic reweighting on the\nimbalanced data, thereby providing better uncertainty estimation. 
Experiments\nin several real-world datasets show that our VIR can outperform\nstate-of-the-art imbalanced regression models in terms of both accuracy and\nuncertainty estimation. Code will soon be available at\n\\url{https://github.com/Wang-ML-Lab/variational-imbalanced-regression}.\n","authors":["Ziyan Wang","Hao Wang"],"pdf_url":"https://arxiv.org/pdf/2306.06599v4.pdf","comment":"Accepted at NeurIPS 2023"},{"id":"http://arxiv.org/abs/2302.08594v3","updated":"2023-10-12T05:43:45Z","published":"2023-02-16T21:38:36Z","title":"TransUPR: A Transformer-based Uncertain Point Refiner for LiDAR Point\n Cloud Semantic Segmentation","summary":" Common image-based LiDAR point cloud semantic segmentation (LiDAR PCSS)\napproaches have bottlenecks resulting from the boundary-blurring problem of\nconvolution neural networks (CNNs) and quantitation loss of spherical\nprojection. In this work, we propose a transformer-based plug-and-play\nuncertain point refiner, i.e., TransUPR, to refine selected uncertain points in\na learnable manner, which leads to an improved segmentation performance.\nUncertain points are sampled from coarse semantic segmentation results of 2D\nimage segmentation where uncertain points are located close to the object\nboundaries in the 2D range image representation and 3D spherical projection\nbackground points. Following that, the geometry and coarse semantic features of\nuncertain points are aggregated by neighbor points in 3D space without adding\nexpensive computation and memory footprint. Finally, the transformer-based\nrefiner, which contains four stacked self-attention layers, along with an MLP\nmodule, is utilized for uncertain point classification on the concatenated\nfeatures of self-attention layers. As the proposed refiner is independent of 2D\nCNNs, our TransUPR can be easily integrated into any existing image-based LiDAR\nPCSS approaches, e.g., CENet. Our TransUPR with the CENet achieves\nstate-of-the-art performance, i.e., 68.2% mean Intersection over Union (mIoU)\non the Semantic KITTI benchmark, which provides a performance improvement of\n0.6% on the mIoU compared to the original CENet.\n","authors":["Zifan Yu","Meida Chen","Zhikang Zhang","Suya You","Raghuveer Rao","Sanjeev Agarwal","Fengbo Ren"],"pdf_url":"https://arxiv.org/pdf/2302.08594v3.pdf","comment":"6 pages; Accepted by 2023 IROS"},{"id":"http://arxiv.org/abs/2310.08044v1","updated":"2023-10-12T05:34:45Z","published":"2023-10-12T05:34:45Z","title":"EC-Depth: Exploring the consistency of self-supervised monocular depth\n estimation under challenging scenes","summary":" Self-supervised monocular depth estimation holds significant importance in\nthe fields of autonomous driving and robotics. However, existing methods are\ntypically designed to train and test on clear and pristine datasets,\noverlooking the impact of various adverse conditions prevalent in real-world\nscenarios. As a result, it is commonly observed that most self-supervised\nmonocular depth estimation methods struggle to perform adequately under\nchallenging conditions. To address this issue, we present EC-Depth, a novel\nself-supervised two-stage training framework to achieve a robust depth\nestimation, starting from the foundation of depth prediction consistency under\ndifferent perturbations. Leveraging the proposed perturbation-invariant depth\nconsistency constraint module and the consistency-based pseudo-label selection\nmodule, our model attains accurate and consistent depth predictions in both\nstandard and challenging scenarios. 
Extensive experiments substantiate the\neffectiveness of the proposed method. Moreover, our method surpasses existing\nstate-of-the-art methods on KITTI, KITTI-C and DrivingStereo benchmarks,\ndemonstrating its potential for enhancing the reliability of self-supervised\nmonocular depth estimation models in real-world applications.\n","authors":["Ruijie Zhu","Ziyang Song","Chuxin Wang","Jianfeng He","Tianzhu Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.08044v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08042v1","updated":"2023-10-12T05:33:25Z","published":"2023-10-12T05:33:25Z","title":"X-HRNet: Towards Lightweight Human Pose Estimation with Spatially\n Unidimensional Self-Attention","summary":" High-resolution representation is necessary for human pose estimation to\nachieve high performance, and the ensuing problem is high computational\ncomplexity. In particular, predominant pose estimation methods estimate human\njoints by 2D single-peak heatmaps. Each 2D heatmap can be horizontally and\nvertically projected to and reconstructed by a pair of 1D heat vectors.\nInspired by this observation, we introduce a lightweight and powerful\nalternative, Spatially Unidimensional Self-Attention (SUSA), to the pointwise\n(1x1) convolution that is the main computational bottleneck in the depthwise\nseparable 3c3 convolution. Our SUSA reduces the computational complexity of the\npointwise (1x1) convolution by 96% without sacrificing accuracy. Furthermore,\nwe use the SUSA as the main module to build our lightweight pose estimation\nbackbone X-HRNet, where `X' represents the estimated cross-shape attention\nvectors. Extensive experiments on the COCO benchmark demonstrate the\nsuperiority of our X-HRNet, and comprehensive ablation studies show the\neffectiveness of the SUSA modules. The code is publicly available at\nhttps://github.com/cool-xuan/x-hrnet.\n","authors":["Yixuan Zhou","Xuanhan Wang","Xing Xu","Lei Zhao","Jingkuan Song"],"pdf_url":"https://arxiv.org/pdf/2310.08042v1.pdf","comment":"Accepted by ICME 2022"},{"id":"http://arxiv.org/abs/2310.05624v2","updated":"2023-10-12T05:33:19Z","published":"2023-10-09T11:26:58Z","title":"Locality-Aware Generalizable Implicit Neural Representation","summary":" Generalizable implicit neural representation (INR) enables a single\ncontinuous function, i.e., a coordinate-based neural network, to represent\nmultiple data instances by modulating its weights or intermediate features\nusing latent codes. However, the expressive power of the state-of-the-art\nmodulation is limited due to its inability to localize and capture fine-grained\ndetails of data entities such as specific pixels and rays. To address this\nissue, we propose a novel framework for generalizable INR that combines a\ntransformer encoder with a locality-aware INR decoder. The transformer encoder\npredicts a set of latent tokens from a data instance to encode local\ninformation into each latent token. The locality-aware INR decoder extracts a\nmodulation vector by selectively aggregating the latent tokens via\ncross-attention for a coordinate input and then predicts the output by\nprogressively decoding with coarse-to-fine modulation through multiple\nfrequency bandwidths. The selective token aggregation and the multi-band\nfeature modulation enable us to learn locality-aware representation in spatial\nand spectral aspects, respectively. 
Our framework significantly outperforms\nprevious generalizable INRs and validates the usefulness of the locality-aware\nlatents for downstream tasks such as image generation.\n","authors":["Doyup Lee","Chiheon Kim","Minsu Cho","Wook-Shin Han"],"pdf_url":"https://arxiv.org/pdf/2310.05624v2.pdf","comment":"19 pages, 12 figures"},{"id":"http://arxiv.org/abs/2310.08038v1","updated":"2023-10-12T05:09:27Z","published":"2023-10-12T05:09:27Z","title":"Continual Learning via Manifold Expansion Replay","summary":" In continual learning, the learner learns multiple tasks in sequence, with\ndata being acquired only once for each task. Catastrophic forgetting is a major\nchallenge to continual learning. To reduce forgetting, some existing\nrehearsal-based methods use episodic memory to replay samples of previous\ntasks. However, in the process of knowledge integration when learning a new\ntask, this strategy also suffers from catastrophic forgetting due to an\nimbalance between old and new knowledge. To address this problem, we propose a\nnovel replay strategy called Manifold Expansion Replay (MaER). We argue that\nexpanding the implicit manifold of the knowledge representation in the episodic\nmemory helps to improve the robustness and expressiveness of the model. To this\nend, we propose a greedy strategy to keep increasing the diameter of the\nimplicit manifold represented by the knowledge in the buffer during memory\nmanagement. In addition, we introduce Wasserstein distance instead of cross\nentropy as distillation loss to preserve previous knowledge. With extensive\nexperimental validation on MNIST, CIFAR10, CIFAR100, and TinyImageNet, we show\nthat the proposed method significantly improves the accuracy in continual\nlearning setup, outperforming the state of the arts.\n","authors":["Zihao Xu","Xuan Tang","Yufei Shi","Jianfeng Zhang","Jian Yang","Mingsong Chen","Xian Wei"],"pdf_url":"https://arxiv.org/pdf/2310.08038v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08035v1","updated":"2023-10-12T05:03:19Z","published":"2023-10-12T05:03:19Z","title":"BaSAL: Size Balanced Warm Start Active Learning for LiDAR Semantic\n Segmentation","summary":" Active learning strives to reduce the need for costly data annotation, by\nrepeatedly querying an annotator to label the most informative samples from a\npool of unlabeled data and retraining a model from these samples. We identify\ntwo problems with existing active learning methods for LiDAR semantic\nsegmentation. First, they ignore the severe class imbalance inherent in LiDAR\nsemantic segmentation datasets. Second, to bootstrap the active learning loop,\nthey train their initial model from randomly selected data samples, which leads\nto low performance and is referred to as the cold start problem. To address\nthese problems we propose BaSAL, a size-balanced warm start active learning\nmodel, based on the observation that each object class has a characteristic\nsize. By sampling object clusters according to their size, we can thus create a\nsize-balanced dataset that is also more class-balanced. Furthermore, in\ncontrast to existing information measures like entropy or CoreSet, size-based\nsampling does not require an already trained model and thus can be used to\naddress the cold start problem. Results show that we are able to improve the\nperformance of the initial model by a large margin. 
Combining size-balanced\nsampling and warm start with established information measures, our approach\nachieves a comparable performance to training on the entire SemanticKITTI\ndataset, despite using only 5% of the annotations, which outperforms existing\nactive learning methods. We also match the existing state-of-the-art in active\nlearning on nuScenes. Our code will be made available upon paper acceptance.\n","authors":["Jiarong Wei","Yancong Lin","Holger Caesar"],"pdf_url":"https://arxiv.org/pdf/2310.08035v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08027v1","updated":"2023-10-12T04:14:28Z","published":"2023-10-12T04:14:28Z","title":"Exploring Large Language Models for Multi-Modal Out-of-Distribution\n Detection","summary":" Out-of-distribution (OOD) detection is essential for reliable and trustworthy\nmachine learning. Recent multi-modal OOD detection leverages textual\ninformation from in-distribution (ID) class names for visual OOD detection, yet\nit currently neglects the rich contextual information of ID classes. Large\nlanguage models (LLMs) encode a wealth of world knowledge and can be prompted\nto generate descriptive features for each class. Indiscriminately using such\nknowledge causes catastrophic damage to OOD detection due to LLMs'\nhallucinations, as is observed by our analysis. In this paper, we propose to\napply world knowledge to enhance OOD detection performance through selective\ngeneration from LLMs. Specifically, we introduce a consistency-based\nuncertainty calibration method to estimate the confidence score of each\ngeneration. We further extract visual objects from each image to fully\ncapitalize on the aforementioned world knowledge. Extensive experiments\ndemonstrate that our method consistently outperforms the state-of-the-art.\n","authors":["Yi Dai","Hao Lang","Kaisheng Zeng","Fei Huang","Yongbin Li"],"pdf_url":"https://arxiv.org/pdf/2310.08027v1.pdf","comment":"EMNLP2023 Findings Long Paper"},{"id":"http://arxiv.org/abs/2310.08026v1","updated":"2023-10-12T04:12:43Z","published":"2023-10-12T04:12:43Z","title":"Beyond Sharing Weights in Decoupling Feature Learning Network for UAV\n RGB-Infrared Vehicle Re-Identification","summary":" Owing to the capacity of performing full-time target search, cross-modality\nvehicle re-identification (Re-ID) based on unmanned aerial vehicle (UAV) is\ngaining more attention in both video surveillance and public security. However,\nthis promising and innovative research has not been studied sufficiently due to\nthe data inadequacy issue. Meanwhile, the cross-modality discrepancy and\norientation discrepancy challenges further aggravate the difficulty of this\ntask. To this end, we pioneer a cross-modality vehicle Re-ID benchmark named\nUAV Cross-Modality Vehicle Re-ID (UCM-VeID), containing 753 identities with\n16015 RGB and 13913 infrared images. Moreover, to meet cross-modality\ndiscrepancy and orientation discrepancy challenges, we present a hybrid weights\ndecoupling network (HWDNet) to learn the shared discriminative\norientation-invariant features. For the first challenge, we proposed a hybrid\nweights siamese network with a well-designed weight restrainer and its\ncorresponding objective function to learn both modality-specific and modality\nshared information. In terms of the second challenge, three effective\ndecoupling structures with two pretext tasks are investigated to learn\norientation-invariant feature. Comprehensive experiments are carried out to\nvalidate the effectiveness of the proposed method. 
The dataset and codes will\nbe released at https://github.com/moonstarL/UAV-CM-VeID.\n","authors":["Xingyue Liu","Jiahao Qi","Chen Chen","Kangcheng Bin","Ping Zhong"],"pdf_url":"https://arxiv.org/pdf/2310.08026v1.pdf","comment":"13 pages, 10 figures, 64 citations, submitted to TMM"},{"id":"http://arxiv.org/abs/2210.15889v4","updated":"2023-10-12T04:05:41Z","published":"2022-10-28T04:38:10Z","title":"Towards Data-and Knowledge-Driven Artificial Intelligence: A Survey on\n Neuro-Symbolic Computing","summary":" Neural-symbolic computing (NeSy), which pursues the integration of the\nsymbolic and statistical paradigms of cognition, has been an active research\narea of Artificial Intelligence (AI) for many years. As NeSy shows promise of\nreconciling the advantages of reasoning and interpretability of symbolic\nrepresentation and robust learning in neural networks, it may serve as a\ncatalyst for the next generation of AI. In the present paper, we provide a\nsystematic overview of the recent developments and important contributions of\nNeSy research. Firstly, we introduce study history of this area, covering early\nwork and foundations. We further discuss background concepts and identify key\ndriving factors behind the development of NeSy. Afterward, we categorize recent\nlandmark approaches along several main characteristics that underline this\nresearch paradigm, including neural-symbolic integration, knowledge\nrepresentation, knowledge embedding, and functionality. Next, we briefly\ndiscuss the successful application of modern NeSy approaches in several\ndomains. Then, we benchmark several NeSy methods on three representative\napplication tasks. Finally, we identify the open problems together with\npotential future research directions. This survey is expected to help new\nresearchers enter this rapidly evolving field and accelerate the progress\ntowards data-and knowledge-driven AI.\n","authors":["Wenguan Wang","Yi Yang","Fei Wu"],"pdf_url":"https://arxiv.org/pdf/2210.15889v4.pdf","comment":"Ongoing project"},{"id":"http://arxiv.org/abs/2308.03807v2","updated":"2023-10-12T03:36:17Z","published":"2023-08-06T15:47:03Z","title":"Nest-DGIL: Nesterov-optimized Deep Geometric Incremental Learning for CS\n Image Reconstruction","summary":" Proximal gradient-based optimization is one of the most common strategies to\nsolve inverse problem of images, and it is easy to implement. However, these\ntechniques often generate heavy artifacts in image reconstruction. One of the\nmost popular refinement methods is to fine-tune the regularization parameter to\nalleviate such artifacts, but it may not always be sufficient or applicable due\nto increased computational costs. In this work, we propose a deep geometric\nincremental learning framework based on the second Nesterov proximal gradient\noptimization. The proposed end-to-end network not only has the powerful\nlearning ability for high-/low-frequency image features, but also can\ntheoretically guarantee that geometric texture details will be reconstructed\nfrom preliminary linear reconstruction. Furthermore, it can avoid the risk of\nintermediate reconstruction results falling outside the geometric decomposition\ndomains and achieve fast convergence. 
Our reconstruction framework is\ndecomposed into four modules including general linear reconstruction, cascade\ngeometric incremental restoration, Nesterov acceleration, and post-processing.\nIn the image restoration step, a cascade geometric incremental learning module\nis designed to compensate for missing texture information from different\ngeometric spectral decomposition domains. Inspired by the overlap-tile\nstrategy, we also develop a post-processing module to remove the block effect\nin patch-wise-based natural image reconstruction. All parameters in the\nproposed model are learnable, an adaptive initialization technique of physical\nparameters is also employed to make model flexibility and ensure converging\nsmoothly. We compare the reconstruction performance of the proposed method with\nexisting state-of-the-art methods to demonstrate its superiority. Our source\ncodes are available at https://github.com/fanxiaohong/Nest-DGIL.\n","authors":["Xiaohong Fan","Yin Yang","Ke Chen","Yujie Feng","Jianping Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.03807v2.pdf","comment":"15 pages,our source codes are available at\n https://github.com/fanxiaohong/Nest-DGIL"},{"id":"http://arxiv.org/abs/2310.06488v2","updated":"2023-10-12T03:23:40Z","published":"2023-10-10T09:57:17Z","title":"SpikeCLIP: A Contrastive Language-Image Pretrained Spiking Neural\n Network","summary":" Spiking neural networks (SNNs) have demonstrated the capability to achieve\ncomparable performance to deep neural networks (DNNs) in both visual and\nlinguistic domains while offering the advantages of improved energy efficiency\nand adherence to biological plausibility. However, the extension of such\nsingle-modality SNNs into the realm of multimodal scenarios remains an\nunexplored territory. Drawing inspiration from the concept of contrastive\nlanguage-image pre-training (CLIP), we introduce a novel framework, named\nSpikeCLIP, to address the gap between two modalities within the context of\nspike-based computing through a two-step recipe involving ``Alignment\nPre-training + Dual-Loss Fine-tuning\". Extensive experiments demonstrate that\nSNNs achieve comparable results to their DNN counterparts while significantly\nreducing energy consumption across a variety of datasets commonly used for\nmultimodal model evaluation. Furthermore, SpikeCLIP maintains robust\nperformance in image classification tasks that involve class labels not\npredefined within specific categories.\n","authors":["Tianlong Li","Wenhao Liu","Changze Lv","Jianhan Xu","Cenyuan Zhang","Muling Wu","Xiaoqing Zheng","Xuanjing Huang"],"pdf_url":"https://arxiv.org/pdf/2310.06488v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08009v1","updated":"2023-10-12T03:21:12Z","published":"2023-10-12T03:21:12Z","title":"Dual-Stream Knowledge-Preserving Hashing for Unsupervised Video\n Retrieval","summary":" Unsupervised video hashing usually optimizes binary codes by learning to\nreconstruct input videos. Such reconstruction constraint spends much effort on\nframe-level temporal context changes without focusing on video-level global\nsemantics that are more useful for retrieval. Hence, we address this problem by\ndecomposing video information into reconstruction-dependent and\nsemantic-dependent information, which disentangles the semantic extraction from\nreconstruction constraint. Specifically, we first design a simple dual-stream\nstructure, including a temporal layer and a hash layer. 
Then, with the help of\nsemantic similarity knowledge obtained from self-supervision, the hash layer\nlearns to capture information for semantic retrieval, while the temporal layer\nlearns to capture the information for reconstruction. In this way, the model\nnaturally preserves the disentangled semantics into binary codes. Validated by\ncomprehensive experiments, our method consistently outperforms the\nstate-of-the-arts on three video benchmarks.\n","authors":["Pandeng Li","Hongtao Xie","Jiannan Ge","Lei Zhang","Shaobo Min","Yongdong Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.08009v1.pdf","comment":"17 pages, 8 figures, ECCV 2022"},{"id":"http://arxiv.org/abs/2310.08002v1","updated":"2023-10-12T03:14:02Z","published":"2023-10-12T03:14:02Z","title":"MLP-AMDC: An MLP Architecture for Adaptive-Mask-based Dual-Camera\n snapshot hyperspectral imaging","summary":" The Coded Aperture Snapshot Spectral Imaging (CASSI) system has great advantages\nover traditional methods in dynamically acquiring Hyper-Spectral Image (HSI),\nbut there are the following problems. 1) Traditional mask relies on random\npatterns or analytical design, both of which limit the performance improvement\nof CASSI. 2) Existing high-quality reconstruction algorithms are slow in\nreconstruction and can only reconstruct scene information offline. To address\nthe above two problems, this paper designs the AMDC-CASSI system, introducing an\nRGB camera with CASSI based on Adaptive-Mask as multimodal input to improve the\nreconstruction quality. The existing SOTA reconstruction schemes are based on\ntransformer, but the operation of self-attention pulls down the operation\nefficiency of the network. In order to improve the inference speed of the\nreconstruction network, this paper proposes an MLP Architecture for\nAdaptive-Mask-based Dual-Camera (MLP-AMDC) to replace the transformer structure\nof the network. Numerous experiments have shown that MLP performs no less well\nthan transformer-based structures for HSI reconstruction, while MLP greatly\nimproves the network inference speed and has fewer parameters and\noperations. Our method achieves an 8 dB improvement over SOTA and at least a 5-fold\nimprovement in reconstruction speed. (https://github.com/caizeyu1992/MLP-AMDC.)\n","authors":["Zeyu Cai","Can Zhang","Xunhao Chen","Shanghuan Liu","Chengqian Jin","Feipeng Da"],"pdf_url":"https://arxiv.org/pdf/2310.08002v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2308.01541"},{"id":"http://arxiv.org/abs/2209.05167v3","updated":"2023-10-12T03:09:25Z","published":"2022-09-12T11:51:20Z","title":"LF-VISLAM: A SLAM Framework for Large Field-of-View Cameras with\n Negative Imaging Plane on Mobile Agents","summary":" Simultaneous Localization And Mapping (SLAM) has become a crucial aspect in\nthe fields of autonomous driving and robotics. One crucial component of visual\nSLAM is the Field-of-View (FoV) of the camera, as a larger FoV allows for a\nwider range of surrounding elements and features to be perceived. However, when\nthe FoV of the camera reaches the negative half-plane, traditional methods for\nrepresenting image feature points using [u,v,1]^T become ineffective. While the\npanoramic FoV is advantageous for loop closure, its benefits are not easily\nrealized under large-attitude-angle differences where loop-closure frames\ncannot be easily matched by existing methods. 
As loop closure on wide-FoV\npanoramic data further comes with a large number of outliers, traditional\noutlier rejection methods are not directly applicable. To address these issues,\nwe propose LF-VISLAM, a Visual Inertial SLAM framework for cameras with\nextremely Large FoV with loop closure. A three-dimensional vector with unit\nlength is introduced to effectively represent feature points even on the\nnegative half-plane. The attitude information of the SLAM system is leveraged\nto guide the feature point detection of the loop closure. Additionally, a new\noutlier rejection method based on the unit length representation is integrated\ninto the loop closure module. We collect the PALVIO dataset using a Panoramic\nAnnular Lens (PAL) system with an entire FoV of 360{\\deg}x(40{\\deg}~120{\\deg})\nand an Inertial Measurement Unit (IMU) for Visual Inertial Odometry (VIO) to\naddress the lack of panoramic SLAM datasets. Experiments on the established\nPALVIO and public datasets show that the proposed LF-VISLAM outperforms\nstate-of-the-art SLAM methods. Our code will be open-sourced at\nhttps://github.com/flysoaryun/LF-VISLAM.\n","authors":["Ze Wang","Kailun Yang","Hao Shi","Peng Li","Fei Gao","Jian Bai","Kaiwei Wang"],"pdf_url":"https://arxiv.org/pdf/2209.05167v3.pdf","comment":"Accepted to IEEE Transactions on Automation Science and Engineering\n (T-ASE). Extended version of IROS2022 paper arXiv:2202.12613. Code and\n dataset will be open-sourced at https://github.com/flysoaryun/LF-SLAM"},{"id":"http://arxiv.org/abs/2310.07997v1","updated":"2023-10-12T02:52:33Z","published":"2023-10-12T02:52:33Z","title":"Point-NeuS: Point-Guided Neural Implicit Surface Reconstruction by\n Volume Rendering","summary":" Recently, learning neural implicit surface by volume rendering has been a\npromising way for multi-view reconstruction. However, limited accuracy and\nexcessive time complexity remain bottlenecks that current methods urgently need\nto overcome. To address these challenges, we propose a new method called\nPoint-NeuS, utilizing point-guided mechanisms to achieve accurate and efficient\nreconstruction. Point modeling is organically embedded into the volume\nrendering to enhance and regularize the representation of implicit surface.\nSpecifically, to achieve precise point guidance and noise robustness, aleatoric\nuncertainty of the point cloud is modeled to capture the distribution of noise\nand estimate the reliability of points. Additionally, a Neural Projection\nmodule connecting points and images is introduced to add geometric constraints\nto the Signed Distance Function (SDF). To better compensate for geometric bias\nbetween volume rendering and point modeling, high-fidelity points are filtered\ninto an Implicit Displacement Network to improve the representation of SDF.\nBenefiting from our effective point guidance, lightweight networks are employed\nto achieve an impressive 11x speedup compared to NeuS. Extensive experiments\nshow that our method yields high-quality surfaces, especially for fine-grained\ndetails and smooth regions. 
Moreover, it exhibits strong robustness to both\nnoisy and sparse data.\n","authors":["Chen Zhang","Wanjuan Su","Wenbing Tao"],"pdf_url":"https://arxiv.org/pdf/2310.07997v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07996v1","updated":"2023-10-12T02:52:14Z","published":"2023-10-12T02:52:14Z","title":"Reset It and Forget It: Relearning Last-Layer Weights Improves Continual\n and Transfer Learning","summary":" This work identifies a simple pre-training mechanism that leads to\nrepresentations exhibiting better continual and transfer learning. This\nmechanism -- the repeated resetting of weights in the last layer, which we\nnickname \"zapping\" -- was originally designed for a meta-continual-learning\nprocedure, yet we show it is surprisingly applicable in many settings beyond\nboth meta-learning and continual learning. In our experiments, we wish to\ntransfer a pre-trained image classifier to a new set of classes, in a few\nshots. We show that our zapping procedure results in improved transfer accuracy\nand/or more rapid adaptation in both standard fine-tuning and continual\nlearning settings, while being simple to implement and computationally\nefficient. In many cases, we achieve performance on par with state of the art\nmeta-learning without needing the expensive higher-order gradients, by using a\ncombination of zapping and sequential learning. An intuitive explanation for\nthe effectiveness of this zapping procedure is that representations trained\nwith repeated zapping learn features that are capable of rapidly adapting to\nnewly initialized classifiers. Such an approach may be considered a\ncomputationally cheaper type of, or alternative to, meta-learning rapidly\nadaptable features with higher-order gradients. This adds to recent work on the\nusefulness of resetting neural network parameters during training, and invites\nfurther investigation of this mechanism.\n","authors":["Lapo Frati","Neil Traft","Jeff Clune","Nick Cheney"],"pdf_url":"https://arxiv.org/pdf/2310.07996v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07995v1","updated":"2023-10-12T02:49:00Z","published":"2023-10-12T02:49:00Z","title":"HeightFormer: A Multilevel Interaction and Image-adaptive\n Classification-regression Network for Monocular Height Estimation with Aerial\n Images","summary":" Height estimation has long been a pivotal topic within measurement and remote\nsensing disciplines, proving critical for endeavours such as 3D urban\nmodelling, MR and autonomous driving. Traditional methods utilise stereo\nmatching or multisensor fusion, both well-established techniques that typically\nnecessitate multiple images from varying perspectives and adjunct sensors like\nSAR, leading to substantial deployment costs. Single image height estimation\nhas emerged as an attractive alternative, boasting a larger data source variety\nand simpler deployment. However, current methods suffer from limitations such\nas fixed receptive fields, a lack of global information interaction, leading to\nnoticeable instance-level height deviations. The inherent complexity of height\nprediction can result in a blurry estimation of object edge depth when using\nmainstream regression methods based on fixed height division. This paper\npresents a comprehensive solution for monocular height estimation in remote\nsensing, termed HeightFormer, combining multilevel interactions and\nimage-adaptive classification-regression. 
It features the Multilevel\nInteraction Backbone (MIB) and Image-adaptive Classification-regression Height\nGenerator (ICG). MIB supplements the fixed sample grid in CNN of the\nconventional backbone network with tokens of different interaction ranges. It\nis complemented by a pixel-, patch-, and feature map-level hierarchical\ninteraction mechanism, designed to relay spatial geometry information across\ndifferent scales and introducing a global receptive field to enhance the\nquality of instance-level height estimation. The ICG dynamically generates\nheight partition for each image and reframes the traditional regression task,\nusing a refinement from coarse to fine classification-regression that\nsignificantly mitigates the innate ill-posedness issue and drastically improves\nedge sharpness.\n","authors":["Zhan Chen","Yidan Zhang","Xiyu Qi","Yongqiang Mao","Xin Zhou","Lulu Niu","Hui Wu","Lei Wang","Yunping Ge"],"pdf_url":"https://arxiv.org/pdf/2310.07995v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.12535v2","updated":"2023-10-12T02:38:50Z","published":"2023-03-21T17:28:44Z","title":"An Effective Motion-Centric Paradigm for 3D Single Object Tracking in\n Point Clouds","summary":" 3D single object tracking in LiDAR point clouds (LiDAR SOT) plays a crucial\nrole in autonomous driving. Current approaches all follow the Siamese paradigm\nbased on appearance matching. However, LiDAR point clouds are usually\ntextureless and incomplete, which hinders effective appearance matching.\nBesides, previous methods greatly overlook the critical motion clues among\ntargets. In this work, beyond 3D Siamese tracking, we introduce a\nmotion-centric paradigm to handle LiDAR SOT from a new perspective. Following\nthis paradigm, we propose a matching-free two-stage tracker M^2-Track. At the\n1st-stage, M^2-Track localizes the target within successive frames via motion\ntransformation. Then it refines the target box through motion-assisted shape\ncompletion at the 2nd-stage. Due to the motion-centric nature, our method shows\nits impressive generalizability with limited training labels and provides good\ndifferentiability for end-to-end cycle training. This inspires us to explore\nsemi-supervised LiDAR SOT by incorporating a pseudo-label-based motion\naugmentation and a self-supervised loss term. Under the fully-supervised\nsetting, extensive experiments confirm that M^2-Track significantly outperforms\nprevious state-of-the-arts on three large-scale datasets while running at 57FPS\n(~3%, ~11% and ~22% precision gains on KITTI, NuScenes, and Waymo Open Dataset\nrespectively). While under the semi-supervised setting, our method performs on\npar with or even surpasses its fully-supervised counterpart using fewer than\nhalf of the labels from KITTI. Further analysis verifies each component's\neffectiveness and shows the motion-centric paradigm's promising potential for\nauto-labeling and unsupervised domain adaptation.\n","authors":["Chaoda Zheng","Xu Yan","Haiming Zhang","Baoyuan Wang","Shenghui Cheng","Shuguang Cui","Zhen Li"],"pdf_url":"https://arxiv.org/pdf/2303.12535v2.pdf","comment":"Accepted version of the journal extension of M^2-Track. Accepted by\n TPAMI. 
arXiv admin note: substantial text overlap with arXiv:2203.01730"},{"id":"http://arxiv.org/abs/2207.12389v2","updated":"2023-10-12T02:01:50Z","published":"2022-07-25T17:55:28Z","title":"MemSAC: Memory Augmented Sample Consistency for Large Scale Unsupervised\n Domain Adaptation","summary":" Practical real world datasets with plentiful categories introduce new\nchallenges for unsupervised domain adaptation like small inter-class\ndiscriminability, that existing approaches relying on domain invariance alone\ncannot handle sufficiently well. In this work we propose MemSAC, which exploits\nsample level similarity across source and target domains to achieve\ndiscriminative transfer, along with architectures that scale to a large number\nof categories. For this purpose, we first introduce a memory augmented approach\nto efficiently extract pairwise similarity relations between labeled source and\nunlabeled target domain instances, suited to handle an arbitrary number of\nclasses. Next, we propose and theoretically justify a novel variant of the\ncontrastive loss to promote local consistency among within-class cross domain\nsamples while enforcing separation between classes, thus preserving\ndiscriminative transfer from source to target. We validate the advantages of\nMemSAC with significant improvements over previous state-of-the-art on multiple\nchallenging transfer tasks designed for large-scale adaptation, such as\nDomainNet with 345 classes and fine-grained adaptation on Caltech-UCSD birds\ndataset with 200 classes. We also provide in-depth analysis and insights into\nthe effectiveness of MemSAC.\n","authors":["Tarun Kalluri","Astuti Sharma","Manmohan Chandraker"],"pdf_url":"https://arxiv.org/pdf/2207.12389v2.pdf","comment":"Accepted at ECCV 2022. Project Webpage:\n https://tarun005.github.io/MemSAC/"},{"id":"http://arxiv.org/abs/2310.07975v1","updated":"2023-10-12T01:47:55Z","published":"2023-10-12T01:47:55Z","title":"Self-supervised visual learning for analyzing firearms trafficking\n activities on the Web","summary":" Automated visual firearms classification from RGB images is an important\nreal-world task with applications in public space security, intelligence\ngathering and law enforcement investigations. When applied to images massively\ncrawled from the World Wide Web (including social media and dark Web sites), it\ncan serve as an important component of systems that attempt to identify\ncriminal firearms trafficking networks, by analyzing Big Data from open-source\nintelligence. Deep Neural Networks (DNN) are the state-of-the-art methodology\nfor achieving this, with Convolutional Neural Networks (CNN) being typically\nemployed. The common transfer learning approach consists of pretraining on a\nlarge-scale, generic annotated dataset for whole-image classification, such as\nImageNet-1k, and then finetuning the DNN on a smaller, annotated,\ntask-specific, downstream dataset for visual firearms classification. Neither\nVisual Transformer (ViT) neural architectures nor Self-Supervised Learning\n(SSL) approaches have been so far evaluated on this critical task. SSL\nessentially consists of replacing the traditional supervised pretraining\nobjective with an unsupervised pretext task that does not require ground-truth\nlabels..\n","authors":["Sotirios Konstantakos","Despina Ioanna Chalkiadaki","Ioannis Mademlis","Adamantia Anna Rebolledo Chrysochoou","Georgios Th. 
Papadopoulos"],"pdf_url":"https://arxiv.org/pdf/2310.07975v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.01244v2","updated":"2023-10-12T01:44:33Z","published":"2022-10-03T21:50:14Z","title":"Event-based Temporally Dense Optical Flow Estimation with Sequential\n Learning","summary":" Event cameras provide an advantage over traditional frame-based cameras when\ncapturing fast-moving objects without a motion blur. They achieve this by\nrecording changes in light intensity (known as events), thus allowing them to\noperate at a much higher frequency and making them suitable for capturing\nmotions in a highly dynamic scene. Many recent studies have proposed methods to\ntrain neural networks (NNs) for predicting optical flow from events. However,\nthey often rely on a spatio-temporal representation constructed from events\nover a fixed interval, such as 10Hz used in training on the DSEC dataset. This\nlimitation restricts the flow prediction to the same interval (10Hz) whereas\nthe fast speed of event cameras, which can operate up to 3kHz, has not been\neffectively utilized. In this work, we show that a temporally dense flow\nestimation at 100Hz can be achieved by treating the flow estimation as a\nsequential problem using two different variants of recurrent networks -\nLong-short term memory (LSTM) and spiking neural network (SNN). First, We\nutilize the NN model constructed similar to the popular EV-FlowNet but with\nLSTM layers to demonstrate the efficiency of our training method. The model not\nonly produces 10x more frequent optical flow than the existing ones, but the\nestimated flows also have 13% lower errors than predictions from the baseline\nEV-FlowNet. Second, we construct an EV-FlowNet SNN but with leaky integrate and\nfire neurons to efficiently capture the temporal dynamics. We found that simple\ninherent recurrent dynamics of SNN lead to significant parameter reduction\ncompared to the LSTM model. In addition, because of its event-driven\ncomputation, the spiking model is estimated to consume only 1.5% energy of the\nLSTM model, highlighting the efficiency of SNN in processing events and the\npotential for achieving temporally dense flow.\n","authors":["Wachirawit Ponghiran","Chamika Mihiranga Liyanagedera","Kaushik Roy"],"pdf_url":"https://arxiv.org/pdf/2210.01244v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07969v1","updated":"2023-10-12T01:25:21Z","published":"2023-10-12T01:25:21Z","title":"CleftGAN: Adapting A Style-Based Generative Adversarial Network To\n Create Images Depicting Cleft Lip Deformity","summary":" A major obstacle when attempting to train a machine learning system to\nevaluate facial clefts is the scarcity of large datasets of high-quality,\nethics board-approved patient images. In response, we have built a deep\nlearning-based cleft lip generator designed to produce an almost unlimited\nnumber of artificial images exhibiting high-fidelity facsimiles of cleft lip\nwith wide variation. We undertook a transfer learning protocol testing\ndifferent versions of StyleGAN-ADA (a generative adversarial network image\ngenerator incorporating adaptive data augmentation (ADA)) as the base model.\nTraining images depicting a variety of cleft deformities were pre-processed to\nadjust for rotation, scaling, color adjustment and background blurring. The ADA\nmodification of the primary algorithm permitted construction of our new\ngenerative model while requiring input of a relatively small number of training\nimages. 
Adversarial training was carried out using 514 unique frontal\nphotographs of cleft-affected faces to adapt a pre-trained model based on\n70,000 normal faces. The Frechet Inception Distance (FID) was used to measure\nthe similarity of the newly generated facial images to the cleft training\ndataset, while Perceptual Path Length (PPL) and the novel Divergence Index of\nSeverity Histograms (DISH) measures were also used to assess the performance of\nthe image generator that we dub CleftGAN. We found that StyleGAN3 with\ntranslation invariance (StyleGAN3-t) performed optimally as a base model.\nGenerated images achieved a low FID reflecting a close similarity to our\ntraining input dataset of genuine cleft images. Low PPL and DISH measures\nreflected a smooth and semantically valid interpolation of images through the\ntransfer learning process and a similar distribution of severity in the\ntraining and generated images, respectively.\n","authors":["Abdullah Hayajneh","Erchin Serpedin","Mohammad Shaqfeh","Graeme Glass","Mitchell A. Stotland"],"pdf_url":"https://arxiv.org/pdf/2310.07969v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.03270v3","updated":"2023-10-12T01:13:41Z","published":"2023-10-05T02:51:53Z","title":"EfficientDM: Efficient Quantization-Aware Fine-Tuning of Low-Bit\n Diffusion Models","summary":" Diffusion models have demonstrated remarkable capabilities in image synthesis\nand related generative tasks. Nevertheless, their practicality for low-latency\nreal-world applications is constrained by substantial computational costs and\nlatency issues. Quantization is a dominant way to compress and accelerate\ndiffusion models, where post-training quantization (PTQ) and quantization-aware\ntraining (QAT) are two main approaches, each bearing its own properties. While\nPTQ exhibits efficiency in terms of both time and data usage, it may lead to\ndiminished performance in low bit-width. On the other hand, QAT can alleviate\nperformance degradation but comes with substantial demands on computational and\ndata resources. To capitalize on the advantages while avoiding their respective\ndrawbacks, we introduce a data-free and parameter-efficient fine-tuning\nframework for low-bit diffusion models, dubbed EfficientDM, to achieve\nQAT-level performance with PTQ-like efficiency. Specifically, we propose a\nquantization-aware variant of the low-rank adapter (QALoRA) that can be merged\nwith model weights and jointly quantized to low bit-width. The fine-tuning\nprocess distills the denoising capabilities of the full-precision model into\nits quantized counterpart, eliminating the requirement for training data. We\nalso introduce scale-aware optimization and employ temporal learned step-size\nquantization to further enhance performance. Extensive experimental results\ndemonstrate that our method significantly outperforms previous PTQ-based\ndiffusion models while maintaining similar time and data efficiency.\nSpecifically, there is only a marginal 0.05 sFID increase when quantizing both\nweights and activations of LDM-4 to 4-bit on ImageNet 256x256. 
Compared to\nQAT-based methods, our EfficientDM also boasts a 16.2x faster quantization\nspeed with comparable generation quality.\n","authors":["Yefei He","Jing Liu","Weijia Wu","Hong Zhou","Bohan Zhuang"],"pdf_url":"https://arxiv.org/pdf/2310.03270v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.00613v3","updated":"2023-10-12T00:27:09Z","published":"2022-12-01T16:09:54Z","title":"NeuWigs: A Neural Dynamic Model for Volumetric Hair Capture and\n Animation","summary":" The capture and animation of human hair are two of the major challenges in\nthe creation of realistic avatars for the virtual reality. Both problems are\nhighly challenging, because hair has complex geometry and appearance, as well\nas exhibits challenging motion. In this paper, we present a two-stage approach\nthat models hair independently from the head to address these challenges in a\ndata-driven manner. The first stage, state compression, learns a\nlow-dimensional latent space of 3D hair states containing motion and\nappearance, via a novel autoencoder-as-a-tracker strategy. To better\ndisentangle the hair and head in appearance learning, we employ multi-view hair\nsegmentation masks in combination with a differentiable volumetric renderer.\nThe second stage learns a novel hair dynamics model that performs temporal hair\ntransfer based on the discovered latent codes. To enforce higher stability\nwhile driving our dynamics model, we employ the 3D point-cloud autoencoder from\nthe compression stage for de-noising of the hair state. Our model outperforms\nthe state of the art in novel view synthesis and is capable of creating novel\nhair animations without having to rely on hair observations as a driving\nsignal. Project page is here https://ziyanw1.github.io/neuwigs/.\n","authors":["Ziyan Wang","Giljoo Nam","Tuur Stuyck","Stephen Lombardi","Chen Cao","Jason Saragih","Michael Zollhoefer","Jessica Hodgins","Christoph Lassner"],"pdf_url":"https://arxiv.org/pdf/2212.00613v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.08141v2","updated":"2023-10-12T23:50:05Z","published":"2023-01-11T22:47:12Z","title":"Self-supervised Learning for Segmentation and Quantification of Dopamine\n Neurons in Parkinson's Disease","summary":" Parkinson's Disease (PD) is the second most common neurodegenerative disease\nin humans. PD is characterized by the gradual loss of dopaminergic neurons in\nthe Substantia Nigra (SN). Counting the number of dopaminergic neurons in the\nSN is one of the most important indexes in evaluating drug efficacy in PD\nanimal models. Currently, analyzing and quantifying dopaminergic neurons is\nconducted manually by experts through analysis of digital pathology images\nwhich is laborious, time-consuming, and highly subjective. As such, a reliable\nand unbiased automated system is demanded for the quantification of\ndopaminergic neurons in digital pathology images. Recent years have seen a\nsurge in adopting deep learning solutions in medical image processing. However,\ndeveloping high-performing deep learning models hinges on the availability of\nlarge-scale, high-quality annotated data, which can be expensive to acquire,\nespecially in applications like digital pathology image analysis. To this end,\nwe propose an end-to-end deep learning framework based on self-supervised\nlearning for the segmentation and quantification of dopaminergic neurons in PD\nanimal models. 
To the best of our knowledge, this is the first deep learning\nmodel that detects the cell body of dopaminergic neurons, counts the number of\ndopaminergic neurons, and provides characteristics of individual dopaminergic\nneurons as a numerical output. Extensive experiments demonstrate the\neffectiveness of our model in quantifying neurons with high precision, which\ncan provide a faster turnaround for drug efficacy studies, better understanding\nof dopaminergic neuronal health status, and unbiased results in PD pre-clinical\nresearch. As part of our contributions, we also provide the first publicly\navailable dataset of histology digital images along with expert annotations for\nthe segmentation of TH-positive DA neuronal soma.\n","authors":["Fatemeh Haghighi","Soumitra Ghosh","Hai Ngu","Sarah Chu","Han Lin","Mohsen Hejrati","Baris Bingol","Somaye Hashemifar"],"pdf_url":"https://arxiv.org/pdf/2301.08141v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08772v1","updated":"2023-10-12T23:38:52Z","published":"2023-10-12T23:38:52Z","title":"Investigating the Robustness and Properties of Detection Transformers\n (DETR) Toward Difficult Images","summary":" Transformer-based object detectors (DETR) have shown significant performance\nacross machine vision tasks, ultimately in object detection. This detector is\nbased on a self-attention mechanism along with the transformer encoder-decoder\narchitecture to capture the global context in the image. The critical issue to\nbe addressed is how this model architecture can handle different image\nnuisances, such as occlusion and adversarial perturbations. We studied this\nissue by measuring the performance of DETR with different experiments and\nbenchmarking the network with convolutional neural network (CNN) based\ndetectors like YOLO and Faster-RCNN. We found that DETR performs well when it\ncomes to resistance to interference from information loss in occlusion images.\nDespite that, we found that the adversarial stickers put on the image require\nthe network to produce a new unnecessary set of keys, queries, and values,\nwhich in most cases, results in a misdirection of the network. DETR also\nperformed poorer than YOLOv5 in the image corruption benchmark. Furthermore, we\nfound that DETR depends heavily on the main query when making a prediction,\nwhich leads to imbalanced contributions between queries since the main query\nreceives most of the gradient flow.\n","authors":["Zhao Ning Zou","Yuhang Zhang","Robert Wijaya"],"pdf_url":"https://arxiv.org/pdf/2310.08772v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08756v1","updated":"2023-10-12T22:51:51Z","published":"2023-10-12T22:51:51Z","title":"Intelligent Scoliosis Screening and Diagnosis: A Survey","summary":" Scoliosis is a three-dimensional spinal deformity, which may lead to abnormal\nmorphologies, such as thoracic deformity, and pelvic tilt. Severe patients may\nsuffer from nerve damage and urinary abnormalities. At present, the number of\nscoliosis patients in primary and secondary schools has exceeded five million\nin China, the incidence rate is about 3% to 5% which is growing every year. The\nresearch on scoliosis, therefore, has important clinical value. This paper\nsystematically introduces computer-assisted scoliosis screening and diagnosis\nas well as analyzes the advantages and limitations of different algorithm\nmodels in the current issue field. 
Moreover, the paper also discusses the\ncurrent development bottlenecks in this field and looks forward to future\ndevelopment trends.\n","authors":["Zhang Zhenlin","Pu Lixin","Li Ang","Zhang Jun","Li Xianjie","Fan Jipeng"],"pdf_url":"https://arxiv.org/pdf/2310.08756v1.pdf","comment":"in Chinese language"},{"id":"http://arxiv.org/abs/2306.14941v2","updated":"2023-10-12T22:49:49Z","published":"2023-06-26T17:54:24Z","title":"SIMMF: Semantics-aware Interactive Multiagent Motion Forecasting for\n Autonomous Vehicle Driving","summary":" Autonomous vehicles require motion forecasting of their surrounding\nmultiagents (pedestrians and vehicles) to make optimal decisions for\nnavigation. The existing methods focus on techniques to utilize the positions\nand velocities of these agents and fail to capture semantic information from\nthe scene. Moreover, to mitigate the increase in computational complexity\nassociated with the number of agents in the scene, some works leverage\nEuclidean distance to prune far-away agents. However, distance-based metric\nalone is insufficient to select relevant agents and accurately perform their\npredictions. To resolve these issues, we propose the Semantics-aware\nInteractive Multiagent Motion Forecasting (SIMMF) method to capture semantics\nalong with spatial information and optimally select relevant agents for motion\nprediction. Specifically, we achieve this by implementing a semantic-aware\nselection of relevant agents from the scene and passing them through an\nattention mechanism to extract global encodings. These encodings along with\nagents' local information, are passed through an encoder to obtain\ntime-dependent latent variables for a motion policy predicting the future\ntrajectories. Our results show that the proposed approach outperforms\nstate-of-the-art baselines and provides more accurate and scene-consistent\npredictions.\n","authors":["Vidyaa Krishnan Nivash","Ahmed H. Qureshi"],"pdf_url":"https://arxiv.org/pdf/2306.14941v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08755v1","updated":"2023-10-12T22:45:03Z","published":"2023-10-12T22:45:03Z","title":"PU-Ray: Point Cloud Upsampling via Ray Marching on Implicit Surface","summary":" While the recent advancements in deep-learning-based point cloud upsampling\nmethods improve the input to autonomous driving systems, they still suffer from\nthe uncertainty of denser point generation resulting from end-to-end learning.\nFor example, due to the vague training objectives of the models, their\nperformance depends on the point distributions of the input and the ground\ntruth. This causes problems of domain dependency between synthetic and\nreal-scanned point clouds and issues with substantial model sizes and dataset\nrequirements. Additionally, many existing methods upsample point clouds with a\nfixed scaling rate, making them inflexible and computationally redundant. This\npaper addresses the above problems by proposing a ray-based upsampling approach\nwith an arbitrary rate, where a depth prediction is made for each query ray.\nThe method simulates the ray marching algorithm to achieve more precise and\nstable ray-depth predictions through implicit surface learning. The rule-based\nmid-point query sampling method enables a uniform output point distribution\nwithout requiring model training using the Chamfer distance loss function,\nwhich can exhibit bias towards the training dataset. Self-supervised learning\nbecomes possible with accurate ground truths within the input point cloud. 
The\nresults demonstrate the method's versatility across different domains and\ntraining scenarios with limited computational resources and training data. This\nallows the upsampling task to transition from academic research to real-world\napplications.\n","authors":["Sangwon Lim","Karim El-Basyouny","Yee Hong Yang"],"pdf_url":"https://arxiv.org/pdf/2310.08755v1.pdf","comment":"13 pages (10 main + 3 supplement), 19 figures (10 main + 9\n supplement), 6 tables"},{"id":"http://arxiv.org/abs/2310.07682v2","updated":"2023-10-12T22:28:05Z","published":"2023-10-11T17:32:24Z","title":"Prediction of MET Overexpression in Non-Small Cell Lung Adenocarcinomas\n from Hematoxylin and Eosin Images","summary":" MET protein overexpression is a targetable event in non-small cell lung\ncancer (NSCLC) and is the subject of active drug development. Challenges in\nidentifying patients for these therapies include lack of access to validated\ntesting, such as standardized immunohistochemistry (IHC) assessment, and\nconsumption of valuable tissue for a single gene/protein assay. Development of\npre-screening algorithms using routinely available digitized hematoxylin and\neosin (H&E)-stained slides to predict MET overexpression could promote testing\nfor those who will benefit most. While assessment of MET expression using IHC\nis currently not routinely performed in NSCLC, next-generation sequencing is\ncommon and in some cases includes RNA expression panel testing. In this work,\nwe leveraged a large database of matched H&E slides and RNA expression data to\ntrain a weakly supervised model to predict MET RNA overexpression directly from\nH&E images. This model was evaluated on an independent holdout test set of 300\nover-expressed and 289 normal patients, demonstrating an ROC-AUC of 0.70 (95th\npercentile interval: 0.66 - 0.74) with stable performance characteristics\nacross different patient clinical variables and robust to synthetic noise on\nthe test set. These results suggest that H&E-based predictive models could be\nuseful to prioritize patients for confirmatory testing of MET protein or MET\ngene expression status.\n","authors":["Kshitij Ingale","Sun Hae Hong","Josh S. K. Bell","Abbas Rizvi","Amy Welch","Lingdao Sha","Irvin Ho","Kunal Nagpal","Aicha BenTaieb","Rohan P Joshi","Martin C Stumpe"],"pdf_url":"https://arxiv.org/pdf/2310.07682v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.06366v4","updated":"2023-10-12T22:25:43Z","published":"2022-10-12T16:18:25Z","title":"A Generalist Framework for Panoptic Segmentation of Images and Videos","summary":" Panoptic segmentation assigns semantic and instance ID labels to every pixel\nof an image. As permutations of instance IDs are also valid solutions, the task\nrequires learning of high-dimensional one-to-many mapping. As a result,\nstate-of-the-art approaches use customized architectures and task-specific loss\nfunctions. We formulate panoptic segmentation as a discrete data generation\nproblem, without relying on inductive bias of the task. A diffusion model is\nproposed to model panoptic masks, with a simple architecture and generic loss\nfunction. By simply adding past predictions as a conditioning signal, our\nmethod is capable of modeling video (in a streaming setting) and thereby learns\nto track object instances automatically. 
With extensive experiments, we\ndemonstrate that our simple approach can perform competitively to\nstate-of-the-art specialist methods in similar settings.\n","authors":["Ting Chen","Lala Li","Saurabh Saxena","Geoffrey Hinton","David J. Fleet"],"pdf_url":"https://arxiv.org/pdf/2210.06366v4.pdf","comment":"ICCV'23. Code at https://github.com/google-research/pix2seq"},{"id":"http://arxiv.org/abs/2310.08745v1","updated":"2023-10-12T22:15:06Z","published":"2023-10-12T22:15:06Z","title":"AcTExplore: Active Tactile Exploration on Unknown Objects","summary":" Tactile exploration plays a crucial role in understanding object structures\nfor fundamental robotics tasks such as grasping and manipulation. However,\nefficiently exploring such objects using tactile sensors is challenging,\nprimarily due to the large-scale unknown environments and limited sensing\ncoverage of these sensors. To this end, we present AcTExplore, an active\ntactile exploration method driven by reinforcement learning for object\nreconstruction at scales that automatically explores the object surfaces in a\nlimited number of steps. Through sufficient exploration, our algorithm\nincrementally collects tactile data and reconstructs 3D shapes of the objects\nas well, which can serve as a representation for higher-level downstream tasks.\nOur method achieves an average of 95.97% IoU coverage on unseen YCB objects\nwhile just being trained on primitive shapes.\n","authors":["Amir-Hossein Shahidzadeh","Seong Jong Yoo","Pavan Mantripragada","Chahat Deep Singh","Cornelia Fermüller","Yiannis Aloimonos"],"pdf_url":"https://arxiv.org/pdf/2310.08745v1.pdf","comment":"8 pages, 6 figures"},{"id":"http://arxiv.org/abs/2310.08743v1","updated":"2023-10-12T22:09:53Z","published":"2023-10-12T22:09:53Z","title":"Development and Validation of a Deep Learning-Based Microsatellite\n Instability Predictor from Prostate Cancer Whole-Slide Images","summary":" Microsatellite instability-high (MSI-H) is a tumor agnostic biomarker for\nimmune checkpoint inhibitor therapy. However, MSI status is not routinely\ntested in prostate cancer, in part due to low prevalence and assay cost. As\nsuch, prediction of MSI status from hematoxylin and eosin (H&E) stained\nwhole-slide images (WSIs) could identify prostate cancer patients most likely\nto benefit from confirmatory testing and becoming eligible for immunotherapy.\nProstate biopsies and surgical resections from de-identified records of\nconsecutive prostate cancer patients referred to our institution were analyzed.\nTheir MSI status was determined by next generation sequencing. Patients before\na cutoff date were split into an algorithm development set (n=4015, MSI-H 1.8%)\nand a paired validation set (n=173, MSI-H 19.7%) that consisted of two serial\nsections from each sample, one stained and scanned internally and the other at\nan external site. Patients after the cutoff date formed the temporal validation\nset (n=1350, MSI-H 2.3%). Attention-based multiple instance learning models\nwere trained to predict MSI-H from H&E WSIs. The MSI-H predictor achieved area\nunder the receiver operating characteristic curve values of 0.78 (95% CI\n[0.69-0.86]), 0.72 (95% CI [0.63-0.81]), and 0.72 (95% CI [0.62-0.82]) on the\ninternally prepared, externally prepared, and temporal validation sets,\nrespectively. While MSI-H status is significantly correlated with Gleason\nscore, the model remained predictive within each Gleason score subgroup. 
In\nsummary, we developed and validated an AI-based MSI-H diagnostic model on a\nlarge real-world cohort of routine H&E slides, which effectively generalized to\nexternally stained and scanned samples and a temporally independent validation\ncohort. This algorithm has the potential to direct prostate cancer patients\ntoward immunotherapy and to identify MSI-H cases secondary to Lynch syndrome.\n","authors":["Qiyuan Hu","Abbas A. Rizvi","Geoffery Schau","Kshitij Ingale","Yoni Muller","Rachel Baits","Sebastian Pretzer","Aïcha BenTaieb","Abigail Gordhamer","Roberto Nussenzveig","Adam Cole","Matthew O. Leavitt","Rohan P. Joshi","Nike Beaubier","Martin C. Stumpe","Kunal Nagpal"],"pdf_url":"https://arxiv.org/pdf/2310.08743v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.11744v2","updated":"2023-10-12T21:20:16Z","published":"2022-11-21T18:59:33Z","title":"Visual Dexterity: In-hand Dexterous Manipulation from Depth","summary":" In-hand object reorientation is necessary for performing many dexterous\nmanipulation tasks, such as tool use in less structured environments that\nremain beyond the reach of current robots. Prior works built reorientation\nsystems assuming one or many of the following: reorienting only specific\nobjects with simple shapes, limited range of reorientation, slow or quasistatic\nmanipulation, simulation-only results, the need for specialized and costly\nsensor suites, and other constraints which make the system infeasible for\nreal-world deployment. We present a general object reorientation controller\nthat does not make these assumptions. It uses readings from a single commodity\ndepth camera to dynamically reorient complex and new object shapes by any\nrotation in real-time, with the median reorientation time being close to seven\nseconds. The controller is trained using reinforcement learning in simulation\nand evaluated in the real world on new object shapes not used for training,\nincluding the most challenging scenario of reorienting objects held in the air\nby a downward-facing hand that must counteract gravity during reorientation.\nOur hardware platform only uses open-source components that cost less than five\nthousand dollars. While we demonstrate the ability to overcome assumptions in\nprior work, there is ample scope for improving absolute performance. For\ninstance, the challenging duck-shaped object not used for training was dropped\nin 56% of the trials. When it was not dropped, our controller reoriented the\nobject within 0.4 radians (i.e., 23 degrees) 75% of the time. Videos are\navailable at: https://taochenshh.github.io/projects/visual-dexterity.\n","authors":["Tao Chen","Megha Tippur","Siyang Wu","Vikash Kumar","Edward Adelson","Pulkit Agrawal"],"pdf_url":"https://arxiv.org/pdf/2211.11744v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.11359v6","updated":"2023-10-12T20:31:52Z","published":"2022-09-23T01:09:06Z","title":"CUTS: A Framework for Multigranular Unsupervised Medical Image\n Segmentation","summary":" Segmenting medical images is critical to facilitating both patient diagnoses\nand quantitative research. A major limiting factor is the lack of labeled data,\nas obtaining expert annotations for each new set of imaging data or task can be\nexpensive, labor intensive, and inconsistent among annotators. 
To address this,\nwe present CUTS (Contrastive and Unsupervised Training for multi-granular\nmedical image Segmentation), a fully unsupervised deep learning framework for\nmedical image segmentation to better utilize the vast majority of imaging data\nthat are not labeled or annotated. CUTS works by leveraging a novel two-stage\napproach. First, it produces an image-specific embedding map via intra-image\ncontrastive loss and a local patch reconstruction objective. Second, these\nembeddings are partitioned at dynamic levels of granularity that correspond to\nthe data topology. Ultimately, CUTS yields a series of coarse-to-fine-grained\nsegmentations that highlight image features at various scales. We apply CUTS to\nretinal fundus images and two types of brain MRI images in order to delineate\nstructures and patterns at different scales, providing distinct information\nrelevant for clinicians. When evaluated against predefined anatomical masks at\na given granularity, CUTS demonstrates improvements ranging from 10% to 200% on\ndice coefficient and Hausdorff distance compared to existing unsupervised\nmethods. Further, CUTS shows performance on par with the latest Segment\nAnything Model which was pre-trained in a supervised fashion on 11 million\nimages and 1.1 billion masks. In summary, with CUTS we demonstrate that medical\nimage segmentation can be effectively solved without relying on large, labeled\ndatasets or vast computational resources.\n","authors":["Chen Liu","Matthew Amodio","Liangbo L. Shen","Feng Gao","Arman Avesta","Sanjay Aneja","Jay C. Wang","Lucian V. Del Priore","Smita Krishnaswamy"],"pdf_url":"https://arxiv.org/pdf/2209.11359v6.pdf","comment":"Additional experiments and updated figures"},{"id":"http://arxiv.org/abs/2310.08705v1","updated":"2023-10-12T20:31:20Z","published":"2023-10-12T20:31:20Z","title":"A Benchmarking Protocol for SAR Colorization: From Regression to Deep\n Learning Approaches","summary":" Synthetic aperture radar (SAR) images are widely used in remote sensing.\nInterpreting SAR images can be challenging due to their intrinsic speckle noise\nand grayscale nature. To address this issue, SAR colorization has emerged as a\nresearch direction to colorize gray scale SAR images while preserving the\noriginal spatial information and radiometric information. However, this\nresearch field is still in its early stages, and many limitations can be\nhighlighted. In this paper, we propose a full research line for supervised\nlearning-based approaches to SAR colorization. Our approach includes a protocol\nfor generating synthetic color SAR images, several baselines, and an effective\nmethod based on the conditional generative adversarial network (cGAN) for SAR\ncolorization. We also propose numerical assessment metrics for the problem at\nhand. To our knowledge, this is the first attempt to propose a research line\nfor SAR colorization that includes a protocol, a benchmark, and a complete\nperformance evaluation. Our extensive tests demonstrate the effectiveness of\nour proposed cGAN-based network for SAR colorization. 
The code will be made\npublicly available.\n","authors":["Kangqing Shen","Gemine Vivone","Xiaoyuan Yang","Simone Lolli","Michael Schmitt"],"pdf_url":"https://arxiv.org/pdf/2310.08705v1.pdf","comment":"16 pages, 16 figures, 6 tables"},{"id":"http://arxiv.org/abs/2304.08960v2","updated":"2023-10-12T20:08:52Z","published":"2023-04-18T12:51:18Z","title":"Generative modeling of living cells with SO(3)-equivariant implicit\n neural representations","summary":" Data-driven cell tracking and segmentation methods in biomedical imaging\nrequire diverse and information-rich training data. In cases where the number\nof training samples is limited, synthetic computer-generated data sets can be\nused to improve these methods. This requires the synthesis of cell shapes as\nwell as corresponding microscopy images using generative models. To synthesize\nrealistic living cell shapes, the shape representation used by the generative\nmodel should be able to accurately represent fine details and changes in\ntopology, which are common in cells. These requirements are not met by 3D voxel\nmasks, which are restricted in resolution, and polygon meshes, which do not\neasily model processes like cell growth and mitosis. In this work, we propose\nto represent living cell shapes as level sets of signed distance functions\n(SDFs) which are estimated by neural networks. We optimize a fully-connected\nneural network to provide an implicit representation of the SDF value at any\npoint in a 3D+time domain, conditioned on a learned latent code that is\ndisentangled from the rotation of the cell shape. We demonstrate the\neffectiveness of this approach on cells that exhibit rapid deformations\n(Platynereis dumerilii), cells that grow and divide (C. elegans), and cells\nthat have growing and branching filopodial protrusions (A549 human lung\ncarcinoma cells). A quantitative evaluation using shape features and Dice\nsimilarity coefficients of real and synthetic cell shapes shows that our model\ncan generate topologically plausible complex cell shapes in 3D+time with high\nsimilarity to real living cell shapes. Finally, we show how microscopy images\nof living cells that correspond to our generated cell shapes can be synthesized\nusing an image-to-image model.\n","authors":["David Wiesner","Julian Suk","Sven Dummer","Tereza Nečasová","Vladimír Ulman","David Svoboda","Jelmer M. Wolterink"],"pdf_url":"https://arxiv.org/pdf/2304.08960v2.pdf","comment":"Medical Image Analysis (MedIA) 2023 (Accepted)"},{"id":"http://arxiv.org/abs/2310.08681v1","updated":"2023-10-12T19:33:53Z","published":"2023-10-12T19:33:53Z","title":"Fed-Safe: Securing Federated Learning in Healthcare Against Adversarial\n Attacks","summary":" This paper explores the security aspects of federated learning applications\nin medical image analysis. Current robustness-oriented methods like adversarial\ntraining, secure aggregation, and homomorphic encryption often risk privacy\ncompromises. The central aim is to defend the network against potential privacy\nbreaches while maintaining model robustness against adversarial manipulations.\nWe show that incorporating distributed noise, grounded in the privacy\nguarantees in federated settings, enables the development of a adversarially\nrobust model that also meets federated privacy standards. 
We conducted\ncomprehensive evaluations across diverse attack scenarios, parameters, and use\ncases in cancer imaging, concentrating on pathology, meningioma, and glioma.\nThe results reveal that the incorporation of distributed noise allows for the\nattainment of security levels comparable to those of conventional adversarial\ntraining while requiring fewer retraining samples to establish a robust model.\n","authors":["Erfan Darzi","Nanna M. Sijtsema","P. M. A van Ooijen"],"pdf_url":"https://arxiv.org/pdf/2310.08681v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.14791v3","updated":"2023-10-12T19:10:57Z","published":"2022-10-26T15:38:28Z","title":"ViNL: Visual Navigation and Locomotion Over Obstacles","summary":" We present Visual Navigation and Locomotion over obstacles (ViNL), which\nenables a quadrupedal robot to navigate unseen apartments while stepping over\nsmall obstacles that lie in its path (e.g., shoes, toys, cables), similar to\nhow humans and pets lift their feet over objects as they walk. ViNL consists\nof: (1) a visual navigation policy that outputs linear and angular velocity\ncommands that guides the robot to a goal coordinate in unfamiliar indoor\nenvironments; and (2) a visual locomotion policy that controls the robot's\njoints to avoid stepping on obstacles while following provided velocity\ncommands. Both the policies are entirely \"model-free\", i.e. sensors-to-actions\nneural networks trained end-to-end. The two are trained independently in two\nentirely different simulators and then seamlessly co-deployed by feeding the\nvelocity commands from the navigator to the locomotor, entirely \"zero-shot\"\n(without any co-training). While prior works have developed learning methods\nfor visual navigation or visual locomotion, to the best of our knowledge, this\nis the first fully learned approach that leverages vision to accomplish both\n(1) intelligent navigation in new environments, and (2) intelligent visual\nlocomotion that aims to traverse cluttered environments without disrupting\nobstacles. On the task of navigation to distant goals in unknown environments,\nViNL using just egocentric vision significantly outperforms prior work on\nrobust locomotion using privileged terrain maps (+32.8% success and -4.42\ncollisions per meter). Additionally, we ablate our locomotion policy to show\nthat each aspect of our approach helps reduce obstacle collisions. Videos and\ncode at http://www.joannetruong.com/projects/vinl.html\n","authors":["Simar Kareer","Naoki Yokoyama","Dhruv Batra","Sehoon Ha","Joanne Truong"],"pdf_url":"https://arxiv.org/pdf/2210.14791v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08671v1","updated":"2023-10-12T19:08:03Z","published":"2023-10-12T19:08:03Z","title":"SSG2: A new modelling paradigm for semantic segmentation","summary":" State-of-the-art models in semantic segmentation primarily operate on single,\nstatic images, generating corresponding segmentation masks. This one-shot\napproach leaves little room for error correction, as the models lack the\ncapability to integrate multiple observations for enhanced accuracy. Inspired\nby work on semantic change detection, we address this limitation by introducing\na methodology that leverages a sequence of observables generated for each\nstatic input image. By adding this \"temporal\" dimension, we exploit strong\nsignal correlations between successive observations in the sequence to reduce\nerror rates. 
Our framework, dubbed SSG2 (Semantic Segmentation Generation 2),\nemploys a dual-encoder, single-decoder base network augmented with a sequence\nmodel. The base model learns to predict the set intersection, union, and\ndifference of labels from dual-input images. Given a fixed target input image\nand a set of support images, the sequence model builds the predicted mask of\nthe target by synthesizing the partial views from each sequence step and\nfiltering out noise. We evaluate SSG2 across three diverse datasets:\nUrbanMonitor, featuring orthoimage tiles from Darwin, Australia with five\nspectral bands and 0.2m spatial resolution; ISPRS Potsdam, which includes true\northophoto images with multiple spectral bands and a 5cm ground sampling\ndistance; and ISIC2018, a medical dataset focused on skin lesion segmentation,\nparticularly melanoma. The SSG2 model demonstrates rapid convergence within the\nfirst few tens of epochs and significantly outperforms UNet-like baseline\nmodels with the same number of gradient updates. However, the addition of the\ntemporal dimension results in an increased memory footprint. While this could\nbe a limitation, it is offset by the advent of higher-memory GPUs and coding\noptimizations.\n","authors":["Foivos I. Diakogiannis","Suzanne Furby","Peter Caccetta","Xiaoliang Wu","Rodrigo Ibata","Ondrej Hlinka","John Taylor"],"pdf_url":"https://arxiv.org/pdf/2310.08671v1.pdf","comment":"19 pages, Under review"},{"id":"http://arxiv.org/abs/2310.08669v1","updated":"2023-10-12T19:01:06Z","published":"2023-10-12T19:01:06Z","title":"Multimodal Large Language Model for Visual Navigation","summary":" Recent efforts to enable visual navigation using large language models have\nmainly focused on developing complex prompt systems. These systems incorporate\ninstructions, observations, and history into massive text prompts, which are\nthen combined with pre-trained large language models to facilitate visual\nnavigation. In contrast, our approach aims to fine-tune large language models\nfor visual navigation without extensive prompt engineering. Our design involves\na simple text prompt, current observations, and a history collector model that\ngathers information from previous observations as input. For output, our design\nprovides a probability distribution of possible actions that the agent can take\nduring navigation. We train our model using human demonstrations and collision\nsignals from the Habitat-Matterport 3D Dataset (HM3D). Experimental results\ndemonstrate that our method outperforms state-of-the-art behavior cloning\nmethods and effectively reduces collision rates.\n","authors":["Yao-Hung Hubert Tsai","Vansh Dhar","Jialu Li","Bowen Zhang","Jian Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.08669v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07891v2","updated":"2023-10-12T18:46:09Z","published":"2023-09-14T17:42:08Z","title":"HandNeRF: Learning to Reconstruct Hand-Object Interaction Scene from a\n Single RGB Image","summary":" This paper presents a method to learn hand-object interaction prior for\nreconstructing a 3D hand-object scene from a single RGB image. The inference as\nwell as training-data generation for 3D hand-object scene reconstruction is\nchallenging due to the depth ambiguity of a single image and occlusions by the\nhand and object. We turn this challenge into an opportunity by utilizing the\nhand shape to constrain the possible relative configuration of the hand and\nobject geometry. 
We design a generalizable implicit function, HandNeRF, that\nexplicitly encodes the correlation of the 3D hand shape features and 2D object\nfeatures to predict the hand and object scene geometry. With experiments on\nreal-world datasets, we show that HandNeRF is able to reconstruct hand-object\nscenes of novel grasp configurations more accurately than comparable methods.\nMoreover, we demonstrate that object reconstruction from HandNeRF ensures more\naccurate execution of downstream tasks, such as grasping and motion planning\nfor robotic hand-over and manipulation. The code will be released here:\nhttps://github.com/SamsungLabs/HandNeRF\n","authors":["Hongsuk Choi","Nikhil Chavan-Dafle","Jiacheng Yuan","Volkan Isler","Hyunsoo Park"],"pdf_url":"https://arxiv.org/pdf/2309.07891v2.pdf","comment":"12 pages including the supplementary material, 8 tables, 12 figures"},{"id":"http://arxiv.org/abs/2210.05633v3","updated":"2023-10-12T18:40:37Z","published":"2022-10-11T17:25:51Z","title":"Habitat-Matterport 3D Semantics Dataset","summary":" We present the Habitat-Matterport 3D Semantics (HM3DSEM) dataset. HM3DSEM is\nthe largest dataset of 3D real-world spaces with densely annotated semantics\nthat is currently available to the academic community. It consists of 142,646\nobject instance annotations across 216 3D spaces and 3,100 rooms within those\nspaces. The scale, quality, and diversity of object annotations far exceed\nthose of prior datasets. A key difference setting apart HM3DSEM from other\ndatasets is the use of texture information to annotate pixel-accurate object\nboundaries. We demonstrate the effectiveness of the HM3DSEM dataset for the Object\nGoal Navigation task using different methods. Policies trained using HM3DSEM\noutperform those trained on prior datasets. Introduction of HM3DSEM in\nthe Habitat ObjectNav Challenge led to an increase in participation from 400\nsubmissions in 2021 to 1022 submissions in 2022.\n","authors":["Karmesh Yadav","Ram Ramrakhya","Santhosh Kumar Ramakrishnan","Theo Gervet","John Turner","Aaron Gokaslan","Noah Maestre","Angel Xuan Chang","Dhruv Batra","Manolis Savva","Alexander William Clegg","Devendra Singh Chaplot"],"pdf_url":"https://arxiv.org/pdf/2210.05633v3.pdf","comment":"15 Pages, 11 Figures, 6 Tables"},{"id":"http://arxiv.org/abs/2310.08654v1","updated":"2023-10-12T18:26:48Z","published":"2023-10-12T18:26:48Z","title":"Histogram- and Diffusion-Based Medical Out-of-Distribution Detection","summary":" Out-of-distribution (OOD) detection is crucial for the safety and reliability\nof artificial intelligence algorithms, especially in the medical domain. In the\ncontext of the Medical OOD (MOOD) detection challenge 2023, we propose a\npipeline that combines a histogram-based method and a diffusion-based method.\nThe histogram-based method is designed to accurately detect homogeneous\nanomalies in the toy examples of the challenge, such as blobs with constant\nintensity values. The diffusion-based method is based on one of the latest\nmethods for unsupervised anomaly detection, called DDPM-OOD. We explore this\nmethod and propose extensive post-processing steps for pixel-level and\nsample-level anomaly detection on brain MRI and abdominal CT data provided by\nthe challenge. Our results show that the proposed DDPM method is sensitive to\nblur and bias field samples, but faces challenges with anatomical deformation,\nblack slice, and swapped patches. 
These findings suggest that further research\nis needed to improve the performance of DDPM for OOD detection in medical\nimages.\n","authors":["Evi M. C. Huijben","Sina Amirrajab","Josien P. W. Pluim"],"pdf_url":"https://arxiv.org/pdf/2310.08654v1.pdf","comment":"9 pages, 5 figures, submission to Medical Out-of-Distribution (MOOD)\n challenge at MICCAI 2023"},{"id":"http://arxiv.org/abs/2310.08645v1","updated":"2023-10-12T18:10:36Z","published":"2023-10-12T18:10:36Z","title":"Defect Analysis of 3D Printed Cylinder Object Using Transfer Learning\n Approaches","summary":" Additive manufacturing (AM) is gaining attention across various industries\nlike healthcare, aerospace, and automotive. However, identifying defects early\nin the AM process can reduce production costs and improve productivity - a key\nchallenge. This study explored the effectiveness of machine learning (ML)\napproaches, specifically transfer learning (TL) models, for defect detection in\n3D-printed cylinders. Images of cylinders were analyzed using models including\nVGG16, VGG19, ResNet50, ResNet101, InceptionResNetV2, and MobileNetV2.\nPerformance was compared across two datasets using accuracy, precision, recall,\nand F1-score metrics. In the first study, VGG16, InceptionResNetV2, and\nMobileNetV2 achieved perfect scores. In contrast, ResNet50 had the lowest\nperformance, with an average F1-score of 0.32. Similarly, in the second study,\nMobileNetV2 correctly classified all instances, while ResNet50 struggled with\nmore false positives and fewer true positives, resulting in an F1-score of\n0.75. Overall, the findings suggest certain TL models like MobileNetV2 can\ndeliver high accuracy for AM defect classification, although performance varies\nacross algorithms. The results provide insights into model optimization and\nintegration needs for reliable automated defect analysis during 3D printing. By\nidentifying the top-performing TL techniques, this study aims to enhance AM\nproduct quality through robust image-based monitoring and inspection.\n","authors":["Md Manjurul Ahsan","Shivakumar Raman","Zahed Siddique"],"pdf_url":"https://arxiv.org/pdf/2310.08645v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08785v1","updated":"2023-10-12T15:43:12Z","published":"2023-10-12T15:43:12Z","title":"DeltaSpace: A Semantic-aligned Feature Space for Flexible Text-guided\n Image Editing","summary":" Text-guided image editing faces significant challenges to training and\ninference flexibility. Much literature collects large amounts of annotated\nimage-text pairs to train text-conditioned generative models from scratch,\nwhich is expensive and not efficient. After that, some approaches that leverage\npre-trained vision-language models are put forward to avoid data collection,\nbut they are also limited by either per text-prompt optimization or\ninference-time hyper-parameters tuning. To address these issues, we investigate\nand identify a specific space, referred to as CLIP DeltaSpace, where the CLIP\nvisual feature difference of two images is semantically aligned with the CLIP\ntextual feature difference of their corresponding text descriptions. Based on\nDeltaSpace, we propose a novel framework called DeltaEdit, which maps the CLIP\nvisual feature differences to the latent space directions of a generative model\nduring the training phase, and predicts the latent space directions from the\nCLIP textual feature differences during the inference phase. 
And this design\nendows DeltaEdit with two advantages: (1) text-free training; (2)\ngeneralization to various text prompts for zero-shot inference. Extensive\nexperiments validate the effectiveness and versatility of DeltaEdit with\ndifferent generative models, including both the GAN model and the diffusion\nmodel, in achieving flexible text-guided image editing. Code is available at\nhttps://github.com/Yueming6568/DeltaEdit.\n","authors":["Yueming Lyu","Kang Zhao","Bo Peng","Yue Jiang","Yingya Zhang","Jing Dong"],"pdf_url":"https://arxiv.org/pdf/2310.08785v1.pdf","comment":"17 pages. arXiv admin note: text overlap with arXiv:2303.06285"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2310.08319v1","updated":"2023-10-12T13:32:35Z","published":"2023-10-12T13:32:35Z","title":"Fine-Tuning LLaMA for Multi-Stage Text Retrieval","summary":" The effectiveness of multi-stage text retrieval has been solidly demonstrated\nsince before the era of pre-trained language models. However, most existing\nstudies utilize models that predate recent advances in large language models\n(LLMs). This study seeks to explore potential improvements that\nstate-of-the-art LLMs can bring. We conduct a comprehensive study, fine-tuning\nthe latest LLaMA model both as a dense retriever (RepLLaMA) and as a pointwise\nreranker (RankLLaMA) for both passage retrieval and document retrieval using\nthe MS MARCO datasets. Our findings demonstrate that the effectiveness of large\nlanguage models indeed surpasses that of smaller models. Additionally, since\nLLMs can inherently handle longer contexts, they can represent entire documents\nholistically, obviating the need for traditional segmenting and pooling\nstrategies. Furthermore, evaluations on BEIR demonstrate that our\nRepLLaMA-RankLLaMA pipeline exhibits strong zero-shot effectiveness. Model\ncheckpoints from this study are available on HuggingFace.\n","authors":["Xueguang Ma","Liang Wang","Nan Yang","Furu Wei","Jimmy Lin"],"pdf_url":"https://arxiv.org/pdf/2310.08319v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08083v1","updated":"2023-10-12T07:14:22Z","published":"2023-10-12T07:14:22Z","title":"On Using GUI Interaction Data to Improve Text Retrieval-based Bug\n Localization","summary":" One of the most important tasks related to managing bug reports is localizing\nthe fault so that a fix can be applied. As such, prior work has aimed to\nautomate this task of bug localization by formulating it as an information\nretrieval problem, where potentially buggy files are retrieved and ranked\naccording to their textual similarity with a given bug report. However, there\nis often a notable semantic gap between the information contained in bug\nreports and identifiers or natural language contained within source code files.\nFor user-facing software, there is currently a key source of information that\ncould aid in bug localization, but has not been thoroughly investigated -\ninformation from the GUI.\n We investigate the hypothesis that, for end user-facing applications,\nconnecting information in a bug report with information from the GUI, and using\nthis to aid in retrieving potentially buggy files, can improve upon existing\ntechniques for bug localization. 
To examine this phenomenon, we conduct a\ncomprehensive empirical study that augments four baseline techniques for bug\nlocalization with GUI interaction information from a reproduction scenario to\n(i) filter out potentially irrelevant files, (ii) boost potentially relevant\nfiles, and (iii) reformulate text-retrieval queries. To carry out our study, we\nsource the current largest dataset of fully-localized and reproducible real\nbugs for Android apps, with corresponding bug reports, consisting of 80 bug\nreports from 39 popular open-source apps. Our results illustrate that\naugmenting traditional techniques with GUI information leads to a marked\nincrease in effectiveness across multiple metrics, including a relative\nincrease in Hits@10 of 13-18%. Additionally, through further analysis, we find\nthat our studied augmentations largely complement existing techniques.\n","authors":["Junayed Mahmud","Nadeeshan De Silva","Safwat Ali Khan","Seyed Hooman Mostafavi","SM Hasan Mansur","Oscar Chaparro","Andrian Marcus","Kevin Moran"],"pdf_url":"https://arxiv.org/pdf/2310.08083v1.pdf","comment":"13 pages, to appear in the Proceedings of the 46th International\n Conference on Software Engineering (ICSE'24)"},{"id":"http://arxiv.org/abs/2310.08069v1","updated":"2023-10-12T06:32:42Z","published":"2023-10-12T06:32:42Z","title":"Rethinking Negative Pairs in Code Search","summary":" Recently, contrastive learning has become a key component in fine-tuning code\nsearch models for software development efficiency and effectiveness. It pulls\ntogether positive code snippets while pushing negative samples away given\nsearch queries. Among contrastive learning, InfoNCE is the most widely used\nloss function due to its better performance. However, the following problems in\nnegative samples of InfoNCE may deteriorate its representation learning: 1) The\nexistence of false negative samples in large code corpora due to duplications.\n2). The failure to explicitly differentiate between the potential relevance of\nnegative samples. As an example, a bubble sorting algorithm example is less\n``negative'' than a file saving function for the quick sorting algorithm query.\nIn this paper, we tackle the above problems by proposing a simple yet effective\nSoft-InfoNCE loss that inserts weight terms into InfoNCE. In our proposed loss\nfunction, we apply three methods to estimate the weights of negative pairs and\nshow that the vanilla InfoNCE loss is a special case of Soft-InfoNCE.\nTheoretically, we analyze the effects of Soft-InfoNCE on controlling the\ndistribution of learnt code representations and on deducing a more precise\nmutual information estimation. We furthermore discuss the superiority of\nproposed loss functions with other design alternatives. Extensive experiments\ndemonstrate the effectiveness of Soft-InfoNCE and weights estimation methods\nunder state-of-the-art code search models on a large-scale public dataset\nconsisting of six programming languages. 
Source code is available at\n\\url{https://github.com/Alex-HaochenLi/Soft-InfoNCE}.\n","authors":["Haochen Li","Xin Zhou","Luu Anh Tuan","Chunyan Miao"],"pdf_url":"https://arxiv.org/pdf/2310.08069v1.pdf","comment":"Accepted to EMNLP 2023"},{"id":"http://arxiv.org/abs/2310.08039v1","updated":"2023-10-12T05:14:42Z","published":"2023-10-12T05:14:42Z","title":"Rethinking Large-scale Pre-ranking System: Entire-chain Cross-domain\n Models","summary":" Industrial systems such as recommender systems and online advertising, have\nbeen widely equipped with multi-stage architectures, which are divided into\nseveral cascaded modules, including matching, pre-ranking, ranking and\nre-ranking. As a critical bridge between matching and ranking, existing\npre-ranking approaches mainly endure sample selection bias (SSB) problem owing\nto ignoring the entire-chain data dependence, resulting in sub-optimal\nperformances. In this paper, we rethink pre-ranking system from the perspective\nof the entire sample space, and propose Entire-chain Cross-domain Models (ECM),\nwhich leverage samples from the whole cascaded stages to effectively alleviate\nSSB problem. Besides, we design a fine-grained neural structure named ECMM to\nfurther improve the pre-ranking accuracy. Specifically, we propose a\ncross-domain multi-tower neural network to comprehensively predict for each\nstage result, and introduce the sub-networking routing strategy with $L0$\nregularization to reduce computational costs. Evaluations on real-world\nlarge-scale traffic logs demonstrate that our pre-ranking models outperform\nSOTA methods while time consumption is maintained within an acceptable level,\nwhich achieves better trade-off between efficiency and effectiveness.\n","authors":["Jinbo Song","Ruoran Huang","Xinyang Wang","Wei Huang","Qian Yu","Mingming Chen","Yafei Yao","Chaosheng Fan","Changping Peng","Zhangang Lin","Jinghe Hu","Jingping Shao"],"pdf_url":"https://arxiv.org/pdf/2310.08039v1.pdf","comment":"5 pages, 2 figures"},{"id":"http://arxiv.org/abs/2310.08038v1","updated":"2023-10-12T05:09:27Z","published":"2023-10-12T05:09:27Z","title":"Continual Learning via Manifold Expansion Replay","summary":" In continual learning, the learner learns multiple tasks in sequence, with\ndata being acquired only once for each task. Catastrophic forgetting is a major\nchallenge to continual learning. To reduce forgetting, some existing\nrehearsal-based methods use episodic memory to replay samples of previous\ntasks. However, in the process of knowledge integration when learning a new\ntask, this strategy also suffers from catastrophic forgetting due to an\nimbalance between old and new knowledge. To address this problem, we propose a\nnovel replay strategy called Manifold Expansion Replay (MaER). We argue that\nexpanding the implicit manifold of the knowledge representation in the episodic\nmemory helps to improve the robustness and expressiveness of the model. To this\nend, we propose a greedy strategy to keep increasing the diameter of the\nimplicit manifold represented by the knowledge in the buffer during memory\nmanagement. In addition, we introduce Wasserstein distance instead of cross\nentropy as distillation loss to preserve previous knowledge. 
With extensive\nexperimental validation on MNIST, CIFAR10, CIFAR100, and TinyImageNet, we show\nthat the proposed method significantly improves the accuracy in continual\nlearning setup, outperforming the state of the arts.\n","authors":["Zihao Xu","Xuan Tang","Yufei Shi","Jianfeng Zhang","Jian Yang","Mingsong Chen","Xian Wei"],"pdf_url":"https://arxiv.org/pdf/2310.08038v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07990v1","updated":"2023-10-12T02:34:56Z","published":"2023-10-12T02:34:56Z","title":"Multi-View Variational Autoencoder for Missing Value Imputation in\n Untargeted Metabolomics","summary":" Background: Missing data is a common challenge in mass spectrometry-based\nmetabolomics, which can lead to biased and incomplete analyses. The integration\nof whole-genome sequencing (WGS) data with metabolomics data has emerged as a\npromising approach to enhance the accuracy of data imputation in metabolomics\nstudies. Method: In this study, we propose a novel method that leverages the\ninformation from WGS data and reference metabolites to impute unknown\nmetabolites. Our approach utilizes a multi-view variational autoencoder to\njointly model the burden score, polygenetic risk score (PGS), and linkage\ndisequilibrium (LD) pruned single nucleotide polymorphisms (SNPs) for feature\nextraction and missing metabolomics data imputation. By learning the latent\nrepresentations of both omics data, our method can effectively impute missing\nmetabolomics values based on genomic information. Results: We evaluate the\nperformance of our method on empirical metabolomics datasets with missing\nvalues and demonstrate its superiority compared to conventional imputation\ntechniques. Using 35 template metabolites derived burden scores, PGS and\nLD-pruned SNPs, the proposed methods achieved r2-scores > 0.01 for 71.55% of\nmetabolites. Conclusion: The integration of WGS data in metabolomics imputation\nnot only improves data completeness but also enhances downstream analyses,\npaving the way for more comprehensive and accurate investigations of metabolic\npathways and disease associations. Our findings offer valuable insights into\nthe potential benefits of utilizing WGS data for metabolomics data imputation\nand underscore the importance of leveraging multi-modal data integration in\nprecision medicine research.\n","authors":["Chen Zhao","Kuan-Jui Su","Chong Wu","Xuewei Cao","Qiuying Sha","Wu Li","Zhe Luo","Tian Qin","Chuan Qiu","Lan Juan Zhao","Anqi Liu","Lindong Jiang","Xiao Zhang","Hui Shen","Weihua Zhou","Hong-Wen Deng"],"pdf_url":"https://arxiv.org/pdf/2310.07990v1.pdf","comment":"19 pages, 3 figures"},{"id":"http://arxiv.org/abs/2310.08759v1","updated":"2023-10-12T22:56:53Z","published":"2023-10-12T22:56:53Z","title":"Question Answering for Electronic Health Records: A Scoping Review of\n datasets and models","summary":" Question Answering (QA) systems on patient-related data can assist both\nclinicians and patients. They can, for example, assist clinicians in\ndecision-making and enable patients to have a better understanding of their\nmedical history. Significant amounts of patient data are stored in Electronic\nHealth Records (EHRs), making EHR QA an important research area. In EHR QA, the\nanswer is obtained from the medical record of the patient. Because of the\ndifferences in data format and modality, this differs greatly from other\nmedical QA tasks that employ medical websites or scientific papers to retrieve\nanswers, making it critical to research EHR question answering. 
This study\naimed to provide a methodological review of existing works on QA over EHRs. We\nsearched for articles from January 1st, 2005 to September 30th, 2023 in four\ndigital sources including Google Scholar, ACL Anthology, ACM Digital Library,\nand PubMed to collect relevant publications on EHR QA. 4111 papers were\nidentified for our study, and after screening based on our inclusion criteria,\nwe obtained a total of 47 papers for further study. Out of the 47 papers, 25\npapers were about EHR QA datasets, and 37 papers were about EHR QA models. It\nwas observed that QA on EHRs is relatively new and unexplored. Most of the\nworks are fairly recent. Also, it was observed that emrQA is by far the most\npopular EHR QA dataset, both in terms of citations and usage in other papers.\nFurthermore, we identified the different models used in EHR QA along with the\nevaluation metrics used for these models.\n","authors":["Jayetri Bardhan","Kirk Roberts","Daisy Zhe Wang"],"pdf_url":"https://arxiv.org/pdf/2310.08759v1.pdf","comment":"5 tables, 6 figures"},{"id":"http://arxiv.org/abs/2306.15010v2","updated":"2023-10-12T06:45:38Z","published":"2023-06-26T18:49:09Z","title":"Efficient High-Resolution Template Matching with Vector Quantized\n Nearest Neighbour Fields","summary":" Template matching is a fundamental problem in computer vision with\napplications in fields including object detection, image registration, and\nobject tracking. Current methods rely on nearest-neighbour (NN) matching, where\nthe query feature space is converted to NN space by representing each query\npixel with its NN in the template. NN-based methods have been shown to perform\nbetter in occlusions, appearance changes, and non-rigid transformations;\nhowever, they scale poorly with high-resolution data and high feature\ndimensions. We present an NN-based method which efficiently reduces the NN\ncomputations and introduces filtering in the NN fields (NNFs). A vector\nquantization step is introduced before the NN calculation to represent the\ntemplate with $k$ features, and the filter response over the NNFs is used to\ncompare the template and query distributions over the features. We show that\nstate-of-the-art performance is achieved in low-resolution data, and our method\noutperforms previous methods at higher resolution.\n","authors":["Ankit Gupta","Ida-Maria Sintorn"],"pdf_url":"https://arxiv.org/pdf/2306.15010v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08613v1","updated":"2023-10-12T06:26:20Z","published":"2023-10-12T06:26:20Z","title":"Individual Variation Affects Outbreak Magnitude and Predictability in an\n Extended Multi-Pathogen SIR Model of Pigeons Vising Dairy Farms","summary":" Zoonotic disease transmission between animals and humans is a growing risk\nand the agricultural context acts as a likely point of transition, with\nindividual heterogeneity acting as an important contributor. Thus,\nunderstanding the dynamics of disease spread in the wildlife-livestock\ninterface is crucial for mitigating these risks of transmission. Specifically,\nthe interactions between pigeons and in-door cows at dairy farms can lead to\nsignificant disease transmission and economic losses for farmers; putting\nlivestock, adjacent human populations, and other wildlife species at risk. In\nthis paper, we propose a novel spatio-temporal multi-pathogen model with\ncontinuous spatial movement. 
The model expands on the\nSusceptible-Exposed-Infected-Recovered-Dead (SEIRD) framework and accounts for\nboth within-species and cross-species transmission of pathogens, as well as the\nexploration-exploitation movement dynamics of pigeons, which play a critical\nrole in the spread of infection agents. In addition to model formulation, we\nalso implement it as an agent-based simulation approach and use empirical field\ndata to investigate different biologically realistic scenarios, evaluating the\neffect of various parameters on the epidemic spread. Namely, in agreement with\ntheoretical expectations, the model predicts that the heterogeneity of the\npigeons' movement dynamics can drastically affect both the magnitude and\nstability of outbreaks. In addition, joint infection by multiple pathogens can\nhave an interactive effect unobservable in single-pathogen SIR models,\nreflecting a non-intuitive inhibition of the outbreak. Our findings highlight\nthe impact of heterogeneity in host behavior on their pathogens and allow\nrealistic predictions of outbreak dynamics in the multi-pathogen\nwildlife-livestock interface with consequences to zoonotic diseases in various\nsystems.\n","authors":["Teddy Lazebnik","Orr Spiegel"],"pdf_url":"https://arxiv.org/pdf/2310.08613v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2310.08588v1","updated":"2023-10-12T17:59:58Z","published":"2023-10-12T17:59:58Z","title":"Octopus: Embodied Vision-Language Programmer from Environmental Feedback","summary":" Large vision-language models (VLMs) have achieved substantial progress in\nmultimodal perception and reasoning. Furthermore, when seamlessly integrated\ninto an embodied agent, it signifies a crucial stride towards the creation of\nautonomous and context-aware systems capable of formulating plans and executing\ncommands with precision. In this paper, we introduce Octopus, a novel VLM\ndesigned to proficiently decipher an agent's vision and textual task objectives\nand to formulate intricate action sequences and generate executable code. Our\ndesign allows the agent to adeptly handle a wide spectrum of tasks, ranging\nfrom mundane daily chores in simulators to sophisticated interactions in\ncomplex video games. Octopus is trained by leveraging GPT-4 to control an\nexplorative agent to generate training data, i.e., action blueprints and the\ncorresponding executable code, within our experimental environment called\nOctoVerse. We also collect the feedback that allows the enhanced training\nscheme of Reinforcement Learning with Environmental Feedback (RLEF). Through a\nseries of experiments, we illuminate Octopus's functionality and present\ncompelling results, and the proposed RLEF turns out to refine the agent's\ndecision-making. 
By open-sourcing our model architecture, simulator, and\ndataset, we aspire to ignite further innovation and foster collaborative\napplications within the broader embodied AI community.\n","authors":["Jingkang Yang","Yuhao Dong","Shuai Liu","Bo Li","Ziyue Wang","Chencheng Jiang","Haoran Tan","Jiamu Kang","Yuanhan Zhang","Kaiyang Zhou","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2310.08588v1.pdf","comment":"Project Page: https://choiszt.github.io/Octopus/, Codebase:\n https://github.com/dongyh20/Octopus"},{"id":"http://arxiv.org/abs/2310.08582v1","updated":"2023-10-12T17:59:50Z","published":"2023-10-12T17:59:50Z","title":"Tree-Planner: Efficient Close-loop Task Planning with Large Language\n Models","summary":" This paper studies close-loop task planning, which refers to the process of\ngenerating a sequence of skills (a plan) to accomplish a specific goal while\nadapting the plan based on real-time observations. Recently, prompting Large\nLanguage Models (LLMs) to generate actions iteratively has become a prevalent\nparadigm due to its superior performance and user-friendliness. However, this\nparadigm is plagued by two inefficiencies: high token consumption and redundant\nerror correction, both of which hinder its scalability for large-scale testing\nand applications. To address these issues, we propose Tree-Planner, which\nreframes task planning with LLMs into three distinct phases: plan sampling,\naction tree construction, and grounded deciding. Tree-Planner starts by using\nan LLM to sample a set of potential plans before execution, followed by the\naggregation of them to form an action tree. Finally, the LLM performs a\ntop-down decision-making process on the tree, taking into account real-time\nenvironmental information. Experiments show that Tree-Planner achieves\nstate-of-the-art performance while maintaining high efficiency. By decomposing\nLLM queries into a single plan-sampling call and multiple grounded-deciding\ncalls, a considerable part of the prompt are less likely to be repeatedly\nconsumed. As a result, token consumption is reduced by 92.2% compared to the\npreviously best-performing model. Additionally, by enabling backtracking on the\naction tree as needed, the correction process becomes more flexible, leading to\na 40.5% decrease in error corrections. Project page:\nhttps://tree-planner.github.io/\n","authors":["Mengkang Hu","Yao Mu","Xinmiao Yu","Mingyu Ding","Shiguang Wu","Wenqi Shao","Qiguang Chen","Bin Wang","Yu Qiao","Ping Luo"],"pdf_url":"https://arxiv.org/pdf/2310.08582v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08577v1","updated":"2023-10-12T17:59:30Z","published":"2023-10-12T17:59:30Z","title":"Visual Data-Type Understanding does not emerge from Scaling\n Vision-Language Models","summary":" Recent advances in the development of vision-language models (VLMs) are\nyielding remarkable success in recognizing visual semantic content, including\nimpressive instances of compositional image understanding. Here, we introduce\nthe novel task of \\textit{Visual Data-Type Identification}, a basic perceptual\nskill with implications for data curation (e.g., noisy data-removal from large\ndatasets, domain-specific retrieval) and autonomous vision (e.g.,\ndistinguishing changing weather conditions from camera lens staining). We\ndevelop two datasets consisting of animal images altered across a diverse set\nof 27 visual \\textit{data-types}, spanning four broad categories. 
An extensive\nzero-shot evaluation of 39 VLMs, ranging from 100M to 80B parameters, shows a\nnuanced performance landscape. While VLMs are reasonably good at identifying\ncertain stylistic \\textit{data-types}, such as cartoons and sketches, they\nstruggle with simpler \\textit{data-types} arising from basic manipulations like\nimage rotations or additive noise. Our findings reveal that (i) model scaling\nalone yields marginal gains for contrastively-trained models like CLIP, and\n(ii) there is a pronounced drop in performance for the largest\nauto-regressively trained VLMs like OpenFlamingo. This finding points to a\nblind spot in current frontier VLMs: they excel in recognizing semantic content\nbut fail to acquire an understanding of visual \\textit{data-types} through\nscaling. By analyzing the pre-training distributions of these models and\nincorporating \\textit{data-type} information into the captions during\nfine-tuning, we achieve a significant enhancement in performance. By exploring\nthis previously uncharted task, we aim to set the stage for further advancing\nVLMs to equip them with visual data-type understanding. Code and datasets are\nreleased \\href{https://github.com/bethgelab/DataTypeIdentification}{here}.\n","authors":["Vishaal Udandarao","Max F. Burg","Samuel Albanie","Matthias Bethge"],"pdf_url":"https://arxiv.org/pdf/2310.08577v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08576v1","updated":"2023-10-12T17:59:23Z","published":"2023-10-12T17:59:23Z","title":"Learning to Act from Actionless Videos through Dense Correspondences","summary":" In this work, we present an approach to construct a video-based robot policy\ncapable of reliably executing diverse tasks across different robots and\nenvironments from few video demonstrations without using any action\nannotations. Our method leverages images as a task-agnostic representation,\nencoding both the state and action information, and text as a general\nrepresentation for specifying robot goals. By synthesizing videos that\n``hallucinate'' robot executing actions and in combination with dense\ncorrespondences between frames, our approach can infer the closed-formed action\nto execute to an environment without the need of any explicit action labels.\nThis unique capability allows us to train the policy solely based on RGB videos\nand deploy learned policies to various robotic tasks. We demonstrate the\nefficacy of our approach in learning policies on table-top manipulation and\nnavigation tasks. Additionally, we contribute an open-source framework for\nefficient video modeling, enabling the training of high-fidelity policy models\nwith four GPUs within a single day.\n","authors":["Po-Chen Ko","Jiayuan Mao","Yilun Du","Shao-Hua Sun","Joshua B. Tenenbaum"],"pdf_url":"https://arxiv.org/pdf/2310.08576v1.pdf","comment":"Project page: https://flow-diffusion.github.io/"},{"id":"http://arxiv.org/abs/2209.07481v2","updated":"2023-10-12T17:59:22Z","published":"2022-09-15T17:22:04Z","title":"Quasi-Arithmetic Mixtures, Divergence Minimization, and Bregman\n Information","summary":" Markov Chain Monte Carlo methods for sampling from complex distributions and\nestimating normalization constants often simulate samples from a sequence of\nintermediate distributions along an annealing path, which bridges between a\ntractable initial distribution and a target density of interest. 
Prior work has\nconstructed annealing paths using quasi-arithmetic means, and interpreted the\nresulting intermediate densities as minimizing an expected divergence to the\nendpoints. We provide a comprehensive analysis of this 'centroid' property\nusing Bregman divergences under a monotonic embedding of the density function,\nthereby associating common divergences such as Amari's and Renyi's\n${\\alpha}$-divergences, ${(\\alpha,\\beta)}$-divergences, and the Jensen-Shannon\ndivergence with intermediate densities along an annealing path. Our analysis\nhighlights the interplay between parametric families, quasi-arithmetic means,\nand divergence functions using the rho-tau Bregman divergence framework of\nZhang 2004,2013.\n","authors":["Rob Brekelmans","Frank Nielsen"],"pdf_url":"https://arxiv.org/pdf/2209.07481v2.pdf","comment":"19 pages + appendix (rewritten + changed title in revision)"},{"id":"http://arxiv.org/abs/2310.08574v1","updated":"2023-10-12T17:57:57Z","published":"2023-10-12T17:57:57Z","title":"Jigsaw: Supporting Designers in Prototyping Multimodal Applications by\n Assembling AI Foundation Models","summary":" Recent advancements in AI foundation models have made it possible for them to\nbe utilized off-the-shelf for creative tasks, including ideating design\nconcepts or generating visual prototypes. However, integrating these models\ninto the creative process can be challenging as they often exist as standalone\napplications tailored to specific tasks. To address this challenge, we\nintroduce Jigsaw, a prototype system that employs puzzle pieces as metaphors to\nrepresent foundation models. Jigsaw allows designers to combine different\nfoundation model capabilities across various modalities by assembling\ncompatible puzzle pieces. To inform the design of Jigsaw, we interviewed ten\ndesigners and distilled design goals. In a user study, we showed that Jigsaw\nenhanced designers' understanding of available foundation model capabilities,\nprovided guidance on combining capabilities across different modalities and\ntasks, and served as a canvas to support design exploration, prototyping, and\ndocumentation.\n","authors":["David Chuan-En Lin","Nikolas Martelaro"],"pdf_url":"https://arxiv.org/pdf/2310.08574v1.pdf","comment":"Webpage: https://preview.jigsaw.to"},{"id":"http://arxiv.org/abs/2310.08571v1","updated":"2023-10-12T17:56:53Z","published":"2023-10-12T17:56:53Z","title":"Bucks for Buckets (B4B): Active Defenses Against Stealing Encoders","summary":" Machine Learning as a Service (MLaaS) APIs provide ready-to-use and\nhigh-utility encoders that generate vector representations for given inputs.\nSince these encoders are very costly to train, they become lucrative targets\nfor model stealing attacks during which an adversary leverages query access to\nthe API to replicate the encoder locally at a fraction of the original training\ncosts. We propose Bucks for Buckets (B4B), the first active defense that\nprevents stealing while the attack is happening without degrading\nrepresentation quality for legitimate API users. Our defense relies on the\nobservation that the representations returned to adversaries who try to steal\nthe encoder's functionality cover a significantly larger fraction of the\nembedding space than representations of legitimate users who utilize the\nencoder to solve a particular downstream task. B4B leverages this to adaptively\nadjust the utility of the returned representations according to a user's\ncoverage of the embedding space. 
To prevent adaptive adversaries from eluding\nour defense by simply creating multiple user accounts (sybils), B4B also\nindividually transforms each user's representations. This prevents the\nadversary from directly aggregating representations over multiple accounts to\ncreate their stolen encoder copy. Our active defense opens a new path towards\nsecurely sharing and democratizing encoders over public APIs.\n","authors":["Jan Dubiński","Stanisław Pawlak","Franziska Boenisch","Tomasz Trzciński","Adam Dziedzic"],"pdf_url":"https://arxiv.org/pdf/2310.08571v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08566v1","updated":"2023-10-12T17:55:02Z","published":"2023-10-12T17:55:02Z","title":"Transformers as Decision Makers: Provable In-Context Reinforcement\n Learning via Supervised Pretraining","summary":" Large transformer models pretrained on offline reinforcement learning\ndatasets have demonstrated remarkable in-context reinforcement learning (ICRL)\ncapabilities, where they can make good decisions when prompted with interaction\ntrajectories from unseen environments. However, when and how transformers can\nbe trained to perform ICRL have not been theoretically well-understood. In\nparticular, it is unclear which reinforcement-learning algorithms transformers\ncan perform in context, and how distribution mismatch in offline training data\naffects the learned algorithms. This paper provides a theoretical framework\nthat analyzes supervised pretraining for ICRL. This includes two recently\nproposed training methods -- algorithm distillation and decision-pretrained\ntransformers. First, assuming model realizability, we prove the\nsupervised-pretrained transformer will imitate the conditional expectation of\nthe expert algorithm given the observed trajectory. The generalization error\nwill scale with model capacity and a distribution divergence factor between the\nexpert and offline algorithms. Second, we show transformers with ReLU attention\ncan efficiently approximate near-optimal online reinforcement learning\nalgorithms like LinUCB and Thompson sampling for stochastic linear bandits, and\nUCB-VI for tabular Markov decision processes. This provides the first\nquantitative analysis of the ICRL capabilities of transformers pretrained from\noffline trajectories.\n","authors":["Licong Lin","Yu Bai","Song Mei"],"pdf_url":"https://arxiv.org/pdf/2310.08566v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08558v1","updated":"2023-10-12T17:50:09Z","published":"2023-10-12T17:50:09Z","title":"Offline Retraining for Online RL: Decoupled Policy Learning to Mitigate\n Exploration Bias","summary":" It is desirable for policies to optimistically explore new states and\nbehaviors during online reinforcement learning (RL) or fine-tuning, especially\nwhen prior offline data does not provide enough state coverage. However,\nexploration bonuses can bias the learned policy, and our experiments find that\nnaive, yet standard use of such bonuses can fail to recover a performant\npolicy. Concurrently, pessimistic training in offline RL has enabled recovery\nof performant policies from static datasets. Can we leverage offline RL to\nrecover better policies from online interaction? We make a simple observation\nthat a policy can be trained from scratch on all interaction data with\npessimistic objectives, thereby decoupling the policies used for data\ncollection and for evaluation. 
Specifically, we propose offline retraining, a\npolicy extraction step at the end of online fine-tuning in our\nOffline-to-Online-to-Offline (OOO) framework for reinforcement learning (RL).\nAn optimistic (exploration) policy is used to interact with the environment,\nand a separate pessimistic (exploitation) policy is trained on all the observed\ndata for evaluation. Such decoupling can reduce any bias from online\ninteraction (intrinsic rewards, primacy bias) in the evaluation policy, and can\nallow more exploratory behaviors during online interaction which in turn can\ngenerate better data for exploitation. OOO is complementary to several\noffline-to-online RL and online RL methods, and improves their average\nperformance by 14% to 26% in our fine-tuning experiments, achieves\nstate-of-the-art performance on several environments in the D4RL benchmarks,\nand improves online RL performance by 165% on two OpenAI gym environments.\nFurther, OOO can enable fine-tuning from incomplete offline datasets where\nprior methods can fail to recover a performant policy. Implementation:\nhttps://github.com/MaxSobolMark/OOO\n","authors":["Max Sobol Mark","Archit Sharma","Fahim Tajwar","Rafael Rafailov","Sergey Levine","Chelsea Finn"],"pdf_url":"https://arxiv.org/pdf/2310.08558v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04370v2","updated":"2023-10-12T17:48:22Z","published":"2023-09-08T15:02:46Z","title":"Seeing-Eye Quadruped Navigation with Force Responsive Locomotion Control","summary":" Seeing-eye robots are very useful tools for guiding visually impaired people,\npotentially producing a huge societal impact given the low availability and\nhigh cost of real guide dogs. Although a few seeing-eye robot systems have\nalready been demonstrated, none considered external tugs from humans, which\nfrequently occur in a real guide dog setting. In this paper, we simultaneously\ntrain a locomotion controller that is robust to external tugging forces via\nReinforcement Learning (RL), and an external force estimator via supervised\nlearning. The controller ensures stable walking, and the force estimator\nenables the robot to respond to the external forces from the human. These\nforces are used to guide the robot to the global goal, which is unknown to the\nrobot, while the robot guides the human around nearby obstacles via a local\nplanner. Experimental results in simulation and on hardware show that our\ncontroller is robust to external forces, and our seeing-eye system can\naccurately detect force direction. We demonstrate our full seeing-eye robot\nsystem on a real quadruped robot with a blindfolded human. The video can be\nseen at our project page: https://bu-air-lab.github.io/guide_dog/\n","authors":["David DeFazio","Eisuke Hirota","Shiqi Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.04370v2.pdf","comment":"Accepted to CoRL 2023"},{"id":"http://arxiv.org/abs/2310.08549v1","updated":"2023-10-12T17:45:05Z","published":"2023-10-12T17:45:05Z","title":"Cross-Episodic Curriculum for Transformer Agents","summary":" We present a new algorithm, Cross-Episodic Curriculum (CEC), to boost the\nlearning efficiency and generalization of Transformer agents. Central to CEC is\nthe placement of cross-episodic experiences into a Transformer's context, which\nforms the basis of a curriculum. By sequentially structuring online learning\ntrials and mixed-quality demonstrations, CEC constructs curricula that\nencapsulate learning progression and proficiency increase across episodes. 
Such\nsynergy combined with the potent pattern recognition capabilities of\nTransformer models delivers a powerful cross-episodic attention mechanism. The\neffectiveness of CEC is demonstrated under two representative scenarios: one\ninvolving multi-task reinforcement learning with discrete control, such as in\nDeepMind Lab, where the curriculum captures the learning progression in both\nindividual and progressively complex settings; and the other involving\nimitation learning with mixed-quality data for continuous control, as seen in\nRoboMimic, where the curriculum captures the improvement in demonstrators'\nexpertise. In all instances, policies resulting from CEC exhibit superior\nperformance and strong generalization. Code is open-sourced at\nhttps://cec-agent.github.io/ to facilitate research on Transformer agent\nlearning.\n","authors":["Lucy Xiaoyang Shi","Yunfan Jiang","Jake Grigsby","Linxi \"Jim\" Fan","Yuke Zhu"],"pdf_url":"https://arxiv.org/pdf/2310.08549v1.pdf","comment":"To appear in NeurIPS 2023; The first two authors contributed equally"},{"id":"http://arxiv.org/abs/2310.08548v1","updated":"2023-10-12T17:44:59Z","published":"2023-10-12T17:44:59Z","title":"Stronger Coreset Bounds for Kernel Density Estimators via Chaining","summary":" We apply the discrepancy method and a chaining approach to give improved\nbounds on the coreset complexity of a wide class of kernel functions. Our\nresults give randomized polynomial time algorithms to produce coresets of size\n$O\\big(\\frac{\\sqrt{d}}{\\varepsilon}\\sqrt{\\log\\log \\frac{1}{\\varepsilon}}\\big)$\nfor the Gaussian and Laplacian kernels in the case that the data set is\nuniformly bounded, an improvement that was not possible with previous\ntechniques. We also obtain coresets of size\n$O\\big(\\frac{1}{\\varepsilon}\\sqrt{\\log\\log \\frac{1}{\\varepsilon}}\\big)$ for the\nLaplacian kernel for $d$ constant. Finally, we give the best known bounds of\n$O\\big(\\frac{\\sqrt{d}}{\\varepsilon}\\sqrt{\\log(2\\max\\{1,\\alpha\\})}\\big)$ on the\ncoreset complexity of the exponential, Hellinger, and JS Kernels, where\n$1/\\alpha$ is the bandwidth parameter of the kernel.\n","authors":["Rainie Bozzai","Thomas Rothvoss"],"pdf_url":"https://arxiv.org/pdf/2310.08548v1.pdf","comment":"23 pages"},{"id":"http://arxiv.org/abs/2310.08540v1","updated":"2023-10-12T17:32:09Z","published":"2023-10-12T17:32:09Z","title":"Do pretrained Transformers Really Learn In-context by Gradient Descent?","summary":" Is In-Context Learning (ICL) implicitly equivalent to Gradient Descent (GD)?\nSeveral recent works draw analogies between the dynamics of GD and the emergent\nbehavior of ICL in large language models. However, these works make assumptions\nfar from the realistic natural language setting in which language models are\ntrained. Such discrepancies between theory and practice, therefore, necessitate\nfurther investigation to validate their applicability.\n We start by highlighting the weaknesses in prior works that construct\nTransformer weights to simulate gradient descent. Their experiments with\ntraining Transformers on ICL objective, inconsistencies in the order\nsensitivity of ICL and GD, sparsity of the constructed weights, and sensitivity\nto parameter changes are some examples of a mismatch from the real-world\nsetting.\n Furthermore, we probe and compare the ICL vs. GD hypothesis in a natural\nsetting. We conduct comprehensive empirical analyses on language models\npretrained on natural data (LLaMa-7B). 
Our comparisons on various performance\nmetrics highlight the inconsistent behavior of ICL and GD as a function of\nvarious factors such as datasets, models, and number of demonstrations. We\nobserve that ICL and GD adapt the output distribution of language models\ndifferently. These results indicate that the equivalence between ICL and GD is\nan open hypothesis, requires nuanced considerations and calls for further\nstudies.\n","authors":["Lingfeng Shen","Aayush Mishra","Daniel Khashabi"],"pdf_url":"https://arxiv.org/pdf/2310.08540v1.pdf","comment":null},{"id":"http://arxiv.org/abs/1910.09143v5","updated":"2023-10-12T17:27:48Z","published":"2019-10-21T04:24:29Z","title":"Dynamic Subgoal-based Exploration via Bayesian Optimization","summary":" Reinforcement learning in sparse-reward navigation environments with\nexpensive and limited interactions is challenging and poses a need for\neffective exploration. Motivated by complex navigation tasks that require\nreal-world training (when cheap simulators are not available), we consider an\nagent that faces an unknown distribution of environments and must decide on an\nexploration strategy. It may leverage a series of training environments to\nimprove its policy before it is evaluated in a test environment drawn from the\nsame environment distribution. Most existing approaches focus on fixed\nexploration strategies, while the few that view exploration as a\nmeta-optimization problem tend to ignore the need for cost-efficient\nexploration. We propose a cost-aware Bayesian optimization approach that\nefficiently searches over a class of dynamic subgoal-based exploration\nstrategies. The algorithm adjusts a variety of levers -- the locations of the\nsubgoals, the length of each episode, and the number of replications per trial\n-- in order to overcome the challenges of sparse rewards, expensive\ninteractions, and noise. An experimental evaluation demonstrates that the new\napproach outperforms existing baselines across a number of problem domains. We\nalso provide a theoretical foundation and prove that the method asymptotically\nidentifies a near-optimal subgoal design.\n","authors":["Yijia Wang","Matthias Poloczek","Daniel R. Jiang"],"pdf_url":"https://arxiv.org/pdf/1910.09143v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05173v2","updated":"2023-10-12T17:25:44Z","published":"2023-09-11T00:02:05Z","title":"DePT: Decomposed Prompt Tuning for Parameter-Efficient Fine-tuning","summary":" Prompt tuning (PT), where a small amount of trainable soft (continuous)\nprompt vectors is affixed to the input of language models (LM), has shown\npromising results across various tasks and models for parameter-efficient\nfine-tuning (PEFT). PT stands out from other PEFT approaches because it\nmaintains competitive performance with fewer trainable parameters and does not\ndrastically scale up its parameters as the model size expands. However, PT\nintroduces additional soft prompt tokens, leading to longer input sequences,\nwhich significantly impacts training and inference time and memory usage due to\nthe Transformer's quadratic complexity. Particularly concerning for Large\nLanguage Models (LLMs) that face heavy daily querying. To address this issue,\nwe propose Decomposed Prompt Tuning (DePT), which decomposes the soft prompt\ninto a shorter soft prompt and a pair of low-rank matrices that are then\noptimised with two different learning rates. 
This allows DePT to achieve better\nperformance while saving over 20% memory and time costs compared to vanilla PT\nand its variants, without changing trainable parameter sizes. Through extensive\nexperiments on 23 natural language processing (NLP) and vision-language (VL)\ntasks, we demonstrate that DePT outperforms state-of-the-art PEFT approaches,\nincluding the full fine-tuning baseline in some scenarios. Additionally, we\nempirically show that DEPT grows more efficient as the model size increases.\nOur further study reveals that DePT integrates seamlessly with\nparameter-efficient transfer learning in the few-shot learning setting and\nhighlights its adaptability to various model architectures and sizes.\n","authors":["Zhengxiang Shi","Aldo Lipani"],"pdf_url":"https://arxiv.org/pdf/2309.05173v2.pdf","comment":"Code is available at https://github.com/ZhengxiangShi/DePT"},{"id":"http://arxiv.org/abs/1908.04628v3","updated":"2023-10-12T17:19:09Z","published":"2019-08-13T13:20:50Z","title":"L2P: Learning to Place for Estimating Heavy-Tailed Distributed Outcomes","summary":" Many real-world prediction tasks have outcome variables that have\ncharacteristic heavy-tail distributions. Examples include copies of books sold,\nauction prices of art pieces, demand for commodities in warehouses, etc. By\nlearning heavy-tailed distributions, \"big and rare\" instances (e.g., the\nbest-sellers) will have accurate predictions. Most existing approaches are not\ndedicated to learning heavy-tailed distribution; thus, they heavily\nunder-predict such instances. To tackle this problem, we introduce Learning to\nPlace (L2P), which exploits the pairwise relationships between instances for\nlearning. In its training phase, L2P learns a pairwise preference classifier:\nis instance A > instance B? In its placing phase, L2P obtains a prediction by\nplacing the new instance among the known instances. Based on its placement, the\nnew instance is then assigned a value for its outcome variable. Experiments on\nreal data show that L2P outperforms competing approaches in terms of accuracy\nand ability to reproduce heavy-tailed outcome distribution. In addition, L2P\nprovides an interpretable model by placing each predicted instance in relation\nto its comparable neighbors. Interpretable models are highly desirable when\nlives and treasure are at stake.\n","authors":["Xindi Wang","Onur Varol","Tina Eliassi-Rad"],"pdf_url":"https://arxiv.org/pdf/1908.04628v3.pdf","comment":"9 pages, 6 figures, 2 tables Nature of changes from previous version:\n 1. Added complexity analysis in Section 2.2 2. Datasets change 3. Added\n LambdaMART in the baseline methods, also a brief discussion on why LambdaMart\n failed in our problem. 4. Figure updates"},{"id":"http://arxiv.org/abs/2310.05898v2","updated":"2023-10-12T17:16:37Z","published":"2023-10-09T17:41:29Z","title":"Lion Secretly Solves Constrained Optimization: As Lyapunov Predicts","summary":" Lion (Evolved Sign Momentum), a new optimizer discovered through program\nsearch, has shown promising results in training large AI models. It performs\ncomparably or favorably to AdamW but with greater memory efficiency. As we can\nexpect from the results of a random search program, Lion incorporates elements\nfrom several existing algorithms, including signed momentum, decoupled weight\ndecay, Polak, and Nesterov momentum, but does not fit into any existing\ncategory of theoretically grounded optimizers. 
Thus, even though Lion appears\nto perform well as a general-purpose optimizer for a wide range of tasks, its\ntheoretical basis remains uncertain. This lack of theoretical clarity limits\nopportunities to further enhance and expand Lion's efficacy.\n This work aims to demystify Lion. Based on both continuous-time and\ndiscrete-time analysis, we demonstrate that Lion is a theoretically novel and\nprincipled approach for minimizing a general loss function $f(x)$ while\nenforcing a bound constraint $\\|x\\|_\\infty \\leq 1/\\lambda$. Lion achieves this\nthrough the incorporation of decoupled weight decay, where $\\lambda$ represents\nthe weight decay coefficient. Our analysis is made possible by the development\nof a new Lyapunov function for the Lion updates. It applies to a broader family\nof Lion-$\\kappa$ algorithms, where the $\\text{sign}(\\cdot)$ operator in Lion is\nreplaced by the subgradient of a convex function $\\kappa$, leading to the\nsolution of a general composite optimization problem of $\\min_x f(x) +\n\\kappa^*(x)$. Our findings provide valuable insights into the dynamics of Lion\nand pave the way for further improvements and extensions of Lion-related\nalgorithms.\n","authors":["Lizhang Chen","Bo Liu","Kaizhao Liang","Qiang Liu"],"pdf_url":"https://arxiv.org/pdf/2310.05898v2.pdf","comment":"26 pages, 6 figures"},{"id":"http://arxiv.org/abs/2310.06225v2","updated":"2023-10-12T17:06:17Z","published":"2023-10-10T00:39:04Z","title":"GPT-4 as an Agronomist Assistant? Answering Agriculture Exams Using\n Large Language Models","summary":" Large language models (LLMs) have demonstrated remarkable capabilities in\nnatural language understanding across various domains, including healthcare and\nfinance. For some tasks, LLMs achieve similar or better performance than\ntrained human beings, therefore it is reasonable to employ human exams (e.g.,\ncertification tests) to assess the performance of LLMs. We present a\ncomprehensive evaluation of popular LLMs, such as Llama 2 and GPT, on their\nability to answer agriculture-related questions. In our evaluation, we also\nemploy RAG (Retrieval-Augmented Generation) and ER (Ensemble Refinement)\ntechniques, which combine information retrieval, generation capabilities, and\nprompting strategies to improve the LLMs' performance. To demonstrate the\ncapabilities of LLMs, we selected agriculture exams and benchmark datasets from\nthree of the largest agriculture producer countries: Brazil, India, and the\nUSA. Our analysis highlights GPT-4's ability to achieve a passing score on\nexams to earn credits for renewing agronomist certifications, answering 93% of\nthe questions correctly and outperforming earlier general-purpose models, which\nachieved 88% accuracy. On one of our experiments, GPT-4 obtained the highest\nperformance when compared to human subjects. This performance suggests that\nGPT-4 could potentially pass on major graduate education admission tests or\neven earn credits for renewing agronomy certificates. We also explore the\nmodels' capacity to address general agriculture-related questions and generate\ncrop management guidelines for Brazilian and Indian farmers, utilizing robust\ndatasets from the Brazilian Agency of Agriculture (Embrapa) and graduate\nprogram exams from India. 
The results suggest that GPT-4, ER, and RAG can\ncontribute meaningfully to agricultural education, assessment, and crop\nmanagement practice, offering valuable insights to farmers and agricultural\nprofessionals.\n","authors":["Bruno Silva","Leonardo Nunes","Roberto Estevão","Vijay Aski","Ranveer Chandra"],"pdf_url":"https://arxiv.org/pdf/2310.06225v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08501v1","updated":"2023-10-12T16:59:50Z","published":"2023-10-12T16:59:50Z","title":"Unsupervised Learning of Object-Centric Embeddings for Cell Instance\n Segmentation in Microscopy Images","summary":" Segmentation of objects in microscopy images is required for many biomedical\napplications. We introduce object-centric embeddings (OCEs), which embed image\npatches such that the spatial offsets between patches cropped from the same\nobject are preserved. Those learnt embeddings can be used to delineate\nindividual objects and thus obtain instance segmentations. Here, we show\ntheoretically that, under assumptions commonly found in microscopy images, OCEs\ncan be learnt through a self-supervised task that predicts the spatial offset\nbetween image patches. Together, this forms an unsupervised cell instance\nsegmentation method which we evaluate on nine diverse large-scale microscopy\ndatasets. Segmentations obtained with our method lead to substantially improved\nresults, compared to state-of-the-art baselines on six out of nine datasets,\nand perform on par on the remaining three datasets. If ground-truth annotations\nare available, our method serves as an excellent starting point for supervised\ntraining, reducing the required amount of ground-truth needed by one order of\nmagnitude, thus substantially increasing the practical applicability of our\nmethod. Source code is available at https://github.com/funkelab/cellulus.\n","authors":["Steffen Wolf","Manan Lalit","Henry Westmacott","Katie McDole","Jan Funke"],"pdf_url":"https://arxiv.org/pdf/2310.08501v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08497v1","updated":"2023-10-12T16:56:37Z","published":"2023-10-12T16:56:37Z","title":"Impact of time and note duration tokenizations on deep learning symbolic\n music modeling","summary":" Symbolic music is widely used in various deep learning tasks, including\ngeneration, transcription, synthesis, and Music Information Retrieval (MIR). It\nis mostly employed with discrete models like Transformers, which require music\nto be tokenized, i.e., formatted into sequences of distinct elements called\ntokens. Tokenization can be performed in different ways. As Transformer can\nstruggle at reasoning, but capture more easily explicit information, it is\nimportant to study how the way the information is represented for such model\nimpact their performances. In this work, we analyze the common tokenization\nmethods and experiment with time and note duration representations. We compare\nthe performances of these two impactful criteria on several tasks, including\ncomposer and emotion classification, music generation, and sequence\nrepresentation learning. 
We demonstrate that explicit information leads to\nbetter results depending on the task.\n","authors":["Nathan Fradet","Nicolas Gutowski","Fabien Chhel","Jean-Pierre Briot"],"pdf_url":"https://arxiv.org/pdf/2310.08497v1.pdf","comment":"ISMIR 2023"},{"id":"http://arxiv.org/abs/2310.08495v1","updated":"2023-10-12T16:55:04Z","published":"2023-10-12T16:55:04Z","title":"Characterizing climate pathways using feature importance on echo state\n networks","summary":" The 2022 National Defense Strategy of the United States listed climate change\nas a serious threat to national security. Climate intervention methods, such as\nstratospheric aerosol injection, have been proposed as mitigation strategies,\nbut the downstream effects of such actions on a complex climate system are not\nwell understood. The development of algorithmic techniques for quantifying\nrelationships between source and impact variables related to a climate event\n(i.e., a climate pathway) would help inform policy decisions. Data-driven deep\nlearning models have become powerful tools for modeling highly nonlinear\nrelationships and may provide a route to characterize climate variable\nrelationships. In this paper, we explore the use of an echo state network (ESN)\nfor characterizing climate pathways. ESNs are a computationally efficient\nneural network variation designed for temporal data, and recent work proposes\nESNs as a useful tool for forecasting spatio-temporal climate data. Like other\nneural networks, ESNs are non-interpretable black-box models, which poses a\nhurdle for understanding variable relationships. We address this issue by\ndeveloping feature importance methods for ESNs in the context of\nspatio-temporal data to quantify variable relationships captured by the model.\nWe conduct a simulation study to assess and compare the feature importance\ntechniques, and we demonstrate the approach on reanalysis climate data. In the\nclimate application, we select a time period that includes the 1991 volcanic\neruption of Mount Pinatubo. This event was a significant stratospheric aerosol\ninjection, which we use as a proxy for an artificial stratospheric aerosol\ninjection. Using the proposed approach, we are able to characterize\nrelationships between pathway variables associated with this event.\n","authors":["Katherine Goode","Daniel Ries","Kellie McClernon"],"pdf_url":"https://arxiv.org/pdf/2310.08495v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08491v1","updated":"2023-10-12T16:50:08Z","published":"2023-10-12T16:50:08Z","title":"Prometheus: Inducing Fine-grained Evaluation Capability in Language\n Models","summary":" Recently, using a powerful proprietary Large Language Model (LLM) (e.g.,\nGPT-4) as an evaluator for long-form responses has become the de facto\nstandard. However, for practitioners with large-scale evaluation tasks and\ncustom criteria in consideration (e.g., child-readability), using proprietary\nLLMs as an evaluator is unreliable due to the closed-source nature,\nuncontrolled versioning, and prohibitive costs. In this work, we propose\nPrometheus, a fully open-source LLM that is on par with GPT-4's evaluation\ncapabilities when the appropriate reference materials (reference answer, score\nrubric) are accompanied. We first construct the Feedback Collection, a new\ndataset that consists of 1K fine-grained score rubrics, 20K instructions, and\n100K responses and language feedback generated by GPT-4. 
Using the Feedback\nCollection, we train Prometheus, a 13B evaluator LLM that can assess any given\nlong-form text based on customized score rubric provided by the user.\nExperimental results show that Prometheus scores a Pearson correlation of 0.897\nwith human evaluators when evaluating with 45 customized score rubrics, which\nis on par with GPT-4 (0.882), and greatly outperforms ChatGPT (0.392).\nFurthermore, measuring correlation with GPT-4 with 1222 customized score\nrubrics across four benchmarks (MT Bench, Vicuna Bench, Feedback Bench, Flask\nEval) shows similar trends, bolstering Prometheus's capability as an evaluator\nLLM. Lastly, Prometheus achieves the highest accuracy on two human preference\nbenchmarks (HHH Alignment & MT Bench Human Judgment) compared to open-sourced\nreward models explicitly trained on human preference datasets, highlighting its\npotential as an universal reward model. We open-source our code, dataset, and\nmodel at https://github.com/kaistAI/Prometheus.\n","authors":["Seungone Kim","Jamin Shin","Yejin Cho","Joel Jang","Shayne Longpre","Hwaran Lee","Sangdoo Yun","Seongjin Shin","Sungdong Kim","James Thorne","Minjoon Seo"],"pdf_url":"https://arxiv.org/pdf/2310.08491v1.pdf","comment":"Work in Progress"},{"id":"http://arxiv.org/abs/2310.06823v2","updated":"2023-10-12T16:42:55Z","published":"2023-10-10T17:53:36Z","title":"NECO: NEural Collapse Based Out-of-distribution detection","summary":" Detecting out-of-distribution (OOD) data is a critical challenge in machine\nlearning due to model overconfidence, often without awareness of their\nepistemological limits. We hypothesize that ``neural collapse'', a phenomenon\naffecting in-distribution data for models trained beyond loss convergence, also\ninfluences OOD data. To benefit from this interplay, we introduce NECO, a novel\npost-hoc method for OOD detection, which leverages the geometric properties of\n``neural collapse'' and of principal component spaces to identify OOD data. Our\nextensive experiments demonstrate that NECO achieves state-of-the-art results\non both small and large-scale OOD detection tasks while exhibiting strong\ngeneralization capabilities across different network architectures.\nFurthermore, we provide a theoretical explanation for the effectiveness of our\nmethod in OOD detection. We plan to release the code after the anonymity\nperiod.\n","authors":["Mouïn Ben Ammar","Nacim Belkhir","Sebastian Popescu","Antoine Manzanera","Gianni Franchi"],"pdf_url":"https://arxiv.org/pdf/2310.06823v2.pdf","comment":"28 pages"},{"id":"http://arxiv.org/abs/2310.08475v1","updated":"2023-10-12T16:32:44Z","published":"2023-10-12T16:32:44Z","title":"Can We Edit Multimodal Large Language Models?","summary":" In this paper, we focus on editing Multimodal Large Language Models (MLLMs).\nCompared to editing single-modal LLMs, multimodal model editing is more\nchallenging, which demands a higher level of scrutiny and careful consideration\nin the editing process. To facilitate research in this area, we construct a new\nbenchmark, dubbed MMEdit, for editing multimodal LLMs and establishing a suite\nof innovative metrics for evaluation. We conduct comprehensive experiments\ninvolving various model editing baselines and analyze the impact of editing\ndifferent components for multimodal LLMs. 
Empirically, we notice that previous\nbaselines can implement editing multimodal LLMs to some extent, but the effect\nis still barely satisfactory, indicating the potential difficulty of this task.\nWe hope that our work can provide the NLP community with insights\\footnote{Code\nand dataset are available in https://github.com/zjunlp/EasyEdit.\n","authors":["Siyuan Cheng","Bozhong Tian","Qingbin Liu","Xi Chen","Yongheng Wang","Huajun Chen","Ningyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.08475v1.pdf","comment":"EMNLP 2023"},{"id":"http://arxiv.org/abs/2310.08470v1","updated":"2023-10-12T16:28:25Z","published":"2023-10-12T16:28:25Z","title":"Strategies and impact of learning curve estimation for CNN-based image\n classification","summary":" Learning curves are a measure for how the performance of machine learning\nmodels improves given a certain volume of training data. Over a wide variety of\napplications and models it was observed that learning curves follow -- to a\nlarge extent -- a power law behavior. This makes the performance of different\nmodels for a given task somewhat predictable and opens the opportunity to\nreduce the training time for practitioners, who are exploring the space of\npossible models and hyperparameters for the problem at hand. By estimating the\nlearning curve of a model from training on small subsets of data only the best\nmodels need to be considered for training on the full dataset. How to choose\nsubset sizes and how often to sample models on these to obtain estimates is\nhowever not researched. Given that the goal is to reduce overall training time\nstrategies are needed that sample the performance in a time-efficient way and\nyet leads to accurate learning curve estimates. In this paper we formulate the\nframework for these strategies and propose several strategies. Further we\nevaluate the strategies for simulated learning curves and in experiments with\npopular datasets and models for image classification tasks.\n","authors":["Laura Didyk","Brayden Yarish","Michael A. Beck","Christopher P. Bidinosti","Christopher J. Henry"],"pdf_url":"https://arxiv.org/pdf/2310.08470v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08461v1","updated":"2023-10-12T16:21:04Z","published":"2023-10-12T16:21:04Z","title":"DistillSpec: Improving Speculative Decoding via Knowledge Distillation","summary":" Speculative decoding (SD) accelerates large language model inference by\nemploying a faster draft model for generating multiple tokens, which are then\nverified in parallel by the larger target model, resulting in the text\ngenerated according to the target model distribution. However, identifying a\ncompact draft model that is well-aligned with the target model is challenging.\nTo tackle this issue, we propose DistillSpec that uses knowledge distillation\nto better align the draft model with the target model, before applying SD.\nDistillSpec makes two key design choices, which we demonstrate via systematic\nstudy to be crucial to improving the draft and target alignment: utilizing\non-policy data generation from the draft model, and tailoring the divergence\nfunction to the task and decoding strategy. Notably, DistillSpec yields\nimpressive 10 - 45% speedups over standard SD on a range of standard\nbenchmarks, using both greedy and non-greedy sampling. Furthermore, we combine\nDistillSpec with lossy SD to achieve fine-grained control over the latency vs.\ntask performance trade-off. 
Finally, in practical scenarios with models of\nvarying sizes, first using distillation to boost the performance of the target\nmodel and then applying DistillSpec to train a well-aligned draft model can\nreduce decoding latency by 6-10x with minimal performance drop, compared to\nstandard decoding without distillation.\n","authors":["Yongchao Zhou","Kaifeng Lyu","Ankit Singh Rawat","Aditya Krishna Menon","Afshin Rostamizadeh","Sanjiv Kumar","Jean-François Kagy","Rishabh Agarwal"],"pdf_url":"https://arxiv.org/pdf/2310.08461v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08459v1","updated":"2023-10-12T16:19:58Z","published":"2023-10-12T16:19:58Z","title":"A Survey on Heterogeneous Transfer Learning","summary":" The application of transfer learning, an approach utilizing knowledge from a\nsource domain to enhance model performance in a target domain, has seen a\ntremendous rise in recent years, underpinning many real-world scenarios. The\nkey to its success lies in the shared common knowledge between the domains, a\nprerequisite in most transfer learning methodologies. These methods typically\npresuppose identical feature spaces and label spaces in both domains, known as\nhomogeneous transfer learning, which, however, is not always a practical\nassumption. Oftentimes, the source and target domains vary in feature spaces,\ndata distributions, and label spaces, making it challenging or costly to secure\nsource domain data with identical feature and label spaces as the target\ndomain. Arbitrary elimination of these differences is not always feasible or\noptimal. Thus, heterogeneous transfer learning, acknowledging and dealing with\nsuch disparities, has emerged as a promising approach for a variety of tasks.\nDespite the existence of a survey in 2017 on this topic, the fast-paced\nadvances post-2017 necessitate an updated, in-depth review. We therefore\npresent a comprehensive survey of recent developments in heterogeneous transfer\nlearning methods, offering a systematic guide for future research. Our paper\nreviews methodologies for diverse learning scenarios, discusses the limitations\nof current studies, and covers various application contexts, including Natural\nLanguage Processing, Computer Vision, Multimodality, and Biomedicine, to foster\na deeper understanding and spur future research.\n","authors":["Runxue Bao","Yiming Sun","Yuhe Gao","Jindong Wang","Qiang Yang","Haifeng Chen","Zhi-Hong Mao","Xing Xie","Ye Ye"],"pdf_url":"https://arxiv.org/pdf/2310.08459v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.15363v3","updated":"2023-10-12T16:12:57Z","published":"2022-11-28T14:38:45Z","title":"On the Security Vulnerabilities of Text-to-SQL Models","summary":" Although it has been demonstrated that Natural Language Processing (NLP)\nalgorithms are vulnerable to deliberate attacks, the question of whether such\nweaknesses can lead to software security threats is under-explored. To bridge\nthis gap, we conducted vulnerability tests on Text-to-SQL systems that are\ncommonly used to create natural language interfaces to databases. We showed\nthat the Text-to-SQL modules within six commercial applications can be\nmanipulated to produce malicious code, potentially leading to data breaches and\nDenial of Service attacks. This is the first demonstration that NLP models can\nbe exploited as attack vectors in the wild. 
In addition, experiments using four\nopen-source language models verified that straightforward backdoor attacks on\nText-to-SQL systems achieve a 100% success rate without affecting their\nperformance. The aim of this work is to draw the community's attention to\npotential software security issues associated with NLP algorithms and encourage\nexploration of methods to mitigate against them.\n","authors":["Xutan Peng","Yipeng Zhang","Jingfeng Yang","Mark Stevenson"],"pdf_url":"https://arxiv.org/pdf/2211.15363v3.pdf","comment":"ISSRE 2023: Best Paper Candidate"},{"id":"http://arxiv.org/abs/2305.14259v3","updated":"2023-10-12T16:10:51Z","published":"2023-05-23T17:12:08Z","title":"Learning to Generate Novel Scientific Directions with Contextualized\n Literature-based Discovery","summary":" Literature-Based Discovery (LBD) aims to discover new scientific knowledge by\nmining papers and generating hypotheses. Standard LBD is limited to predicting\npairwise relations between discrete concepts (e.g., drug-disease links), and\nignores critical contexts like experimental settings (e.g., a specific patient\npopulation where a drug is evaluated) and background motivations (e.g., to find\ndrugs without specific side effects). We address these limitations with a novel\nformulation of contextualized-LBD (C-LBD): generating scientific hypotheses in\nnatural language, while grounding them in a context that controls the\nhypothesis search space. We present a modeling framework using retrieval of\n``inspirations'' from past scientific papers. Our evaluations reveal that GPT-4\ntends to generate ideas with overall low technical depth and novelty, while our\ninspiration prompting approaches partially mitigate this issue. Our work\nrepresents a first step toward building language models that generate new ideas\nderived from scientific literature.\n","authors":["Qingyun Wang","Doug Downey","Heng Ji","Tom Hope"],"pdf_url":"https://arxiv.org/pdf/2305.14259v3.pdf","comment":"24 pages. Code and resource is available at\n https://github.com/EagleW/CLBD"},{"id":"http://arxiv.org/abs/2310.08446v1","updated":"2023-10-12T16:06:18Z","published":"2023-10-12T16:06:18Z","title":"Towards Robust Multi-Modal Reasoning via Model Selection","summary":" The reasoning capabilities of LLM (Large Language Model) are widely\nacknowledged in recent research, inspiring studies on tool learning and\nautonomous agents. LLM serves as the \"brain\" of agent, orchestrating multiple\ntools for collaborative multi-step task solving. Unlike methods invoking tools\nlike calculators or weather APIs for straightforward tasks, multi-modal agents\nexcel by integrating diverse AI models for complex challenges. However, current\nmulti-modal agents neglect the significance of model selection: they primarily\nfocus on the planning and execution phases, and will only invoke predefined\ntask-specific models for each subtask, making the execution fragile. Meanwhile,\nother traditional model selection methods are either incompatible with or\nsuboptimal for the multi-modal agent scenarios, due to ignorance of\ndependencies among subtasks arising by multi-step reasoning.\n To this end, we identify the key challenges therein and propose the\n$\\textit{M}^3$ framework as a plug-in with negligible runtime overhead at\ntest-time. This framework improves model selection and bolsters the robustness\nof multi-modal agents in multi-step reasoning. 
In the absence of suitable\nbenchmarks, we create MS-GQA, a new dataset specifically designed to\ninvestigate the model selection challenge in multi-modal agents. Our\nexperiments reveal that our framework enables dynamic model selection,\nconsidering both user inputs and subtask dependencies, thereby robustifying the\noverall reasoning process. Our code and benchmark:\nhttps://github.com/LINs-lab/M3.\n","authors":["Xiangyan Liu","Rongxue Li","Wei Ji","Tao Lin"],"pdf_url":"https://arxiv.org/pdf/2310.08446v1.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2308.16198v2","updated":"2023-10-12T15:57:52Z","published":"2023-08-25T21:30:16Z","title":"Learning Collaborative Information Dissemination with Graph-based\n Multi-Agent Reinforcement Learning","summary":" In modern communication systems, efficient and reliable information\ndissemination is crucial for supporting critical operations across domains like\ndisaster response, autonomous vehicles, and sensor networks. This paper\nintroduces a Multi-Agent Reinforcement Learning (MARL) approach as a\nsignificant step forward in achieving more decentralized, efficient, and\ncollaborative solutions. We propose a Partially Observable Stochastic Game\n(POSG) formulation for information dissemination empowering each agent to\ndecide on message forwarding independently, based on their one-hop\nneighborhood. This constitutes a significant paradigm shift from traditional\nheuristics based on Multi-Point Relay (MPR) selection. Our approach harnesses\nGraph Convolutional Reinforcement Learning, employing Graph Attention Networks\n(GAT) with dynamic attention to capture essential network features. We propose\ntwo approaches, L-DGN and HL-DGN, which differ in the information that is\nexchanged among agents. We evaluate the performance of our decentralized\napproaches, by comparing them with a widely-used MPR heuristic, and we show\nthat our trained policies are able to efficiently cover the network while\nbypassing the MPR set selection process. Our approach is a first step toward\nsupporting the resilience of real-world broadcast communication infrastructures\nvia learned, collaborative information dissemination.\n","authors":["Raffaele Galliera","Kristen Brent Venable","Matteo Bassani","Niranjan Suri"],"pdf_url":"https://arxiv.org/pdf/2308.16198v2.pdf","comment":"11 pages (2 of Supplementary Materials), 4 figures, 3 tables"},{"id":"http://arxiv.org/abs/2310.08431v1","updated":"2023-10-12T15:56:02Z","published":"2023-10-12T15:56:02Z","title":"Neural Sampling in Hierarchical Exponential-family Energy-based Models","summary":" Bayesian brain theory suggests that the brain employs generative models to\nunderstand the external world. The sampling-based perspective posits that the\nbrain infers the posterior distribution through samples of stochastic neuronal\nresponses. Additionally, the brain continually updates its generative model to\napproach the true distribution of the external world. In this study, we\nintroduce the Hierarchical Exponential-family Energy-based (HEE) model, which\ncaptures the dynamics of inference and learning. In the HEE model, we decompose\nthe partition function into individual layers and leverage a group of neurons\nwith shorter time constants to sample the gradient of the decomposed\nnormalization term. This allows our model to estimate the partition function\nand perform inference simultaneously, circumventing the negative phase\nencountered in conventional energy-based models (EBMs). 
As a result, the\nlearning process is localized both in time and space, and the model is easy to\nconverge. To match the brain's rapid computation, we demonstrate that neural\nadaptation can serve as a momentum term, significantly accelerating the\ninference process. On natural image datasets, our model exhibits\nrepresentations akin to those observed in the biological visual system.\nFurthermore, for the machine learning community, our model can generate\nobservations through joint or marginal generation. We show that marginal\ngeneration outperforms joint generation and achieves performance on par with\nother EBMs.\n","authors":["Xingsi Dong","Si Wu"],"pdf_url":"https://arxiv.org/pdf/2310.08431v1.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2304.09663v2","updated":"2023-10-12T15:52:37Z","published":"2023-04-19T13:50:13Z","title":"Generative modeling of time-dependent densities via optimal transport\n and projection pursuit","summary":" Motivated by the computational difficulties incurred by popular deep learning\nalgorithms for the generative modeling of temporal densities, we propose a\ncheap alternative which requires minimal hyperparameter tuning and scales\nfavorably to high dimensional problems. In particular, we use a\nprojection-based optimal transport solver [Meng et al., 2019] to join\nsuccessive samples and subsequently use transport splines [Chewi et al., 2020]\nto interpolate the evolving density. When the sampling frequency is\nsufficiently high, the optimal maps are close to the identity and are thus\ncomputationally efficient to compute. Moreover, the training process is highly\nparallelizable as all optimal maps are independent and can thus be learned\nsimultaneously. Finally, the approach is based solely on numerical linear\nalgebra rather than minimizing a nonconvex objective function, allowing us to\neasily analyze and control the algorithm. We present several numerical\nexperiments on both synthetic and real-world datasets to demonstrate the\nefficiency of our method. In particular, these experiments show that the\nproposed approach is highly competitive compared with state-of-the-art\nnormalizing flows conditioned on time across a wide range of dimensionalities.\n","authors":["Jonah Botvinick-Greenhouse","Yunan Yang","Romit Maulik"],"pdf_url":"https://arxiv.org/pdf/2304.09663v2.pdf","comment":"This article may be downloaded for personal use only. Any other use\n requires prior permission of the author and AIP Publishing. This article\n appeared in Chaos: An Interdisciplinary Journal of Nonlinear Science, Volume\n 33, Issue 10, October 2023 and may be found at\n https://doi.org/10.1063/5.0155783"},{"id":"http://arxiv.org/abs/2310.08425v1","updated":"2023-10-12T15:48:14Z","published":"2023-10-12T15:48:14Z","title":"Differentially Private Non-convex Learning for Multi-layer Neural\n Networks","summary":" This paper focuses on the problem of Differentially Private Stochastic\nOptimization for (multi-layer) fully connected neural networks with a single\noutput node. In the first part, we examine cases with no hidden nodes,\nspecifically focusing on Generalized Linear Models (GLMs). We investigate the\nwell-specific model where the random noise possesses a zero mean, and the link\nfunction is both bounded and Lipschitz continuous. We propose several\nalgorithms and our analysis demonstrates the feasibility of achieving an excess\npopulation risk that remains invariant to the data dimension. 
We also delve\ninto the scenario involving the ReLU link function, and our findings mirror\nthose of the bounded link function. We conclude this section by contrasting\nwell-specified and misspecified models, using ReLU regression as a\nrepresentative example.\n In the second part of the paper, we extend our ideas to two-layer neural\nnetworks with sigmoid or ReLU activation functions in the well-specified model.\nIn the third part, we study the theoretical guarantees of DP-SGD in Abadi et\nal. (2016) for fully connected multi-layer neural networks. By utilizing recent\nadvances in Neural Tangent Kernel theory, we provide the first excess\npopulation risk when both the sample size and the width of the network are\nsufficiently large. Additionally, we discuss the role of some parameters in\nDP-SGD regarding their utility, both theoretically and empirically.\n","authors":["Hanpu Shen","Cheng-Long Wang","Zihang Xiang","Yiming Ying","Di Wang"],"pdf_url":"https://arxiv.org/pdf/2310.08425v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.12345v4","updated":"2023-10-12T15:44:42Z","published":"2022-11-22T15:34:59Z","title":"Understanding Sparse Feature Updates in Deep Networks using Iterative\n Linearisation","summary":" Larger and deeper networks generalise well despite their increased capacity\nto overfit. Understanding why this happens is theoretically and practically\nimportant. One recent approach looks at the infinitely wide limits of such\nnetworks and their corresponding kernels. However, these theoretical tools\ncannot fully explain finite networks as the empirical kernel changes\nsignificantly during gradient-descent-based training in contrast to infinite\nnetworks. In this work, we derive an iterative linearised training method as a\nnovel empirical tool to further investigate this distinction, allowing us to\ncontrol for sparse (i.e. infrequent) feature updates and quantify the frequency\nof feature learning needed to achieve comparable performance. We justify\niterative linearisation as an interpolation between a finite analog of the\ninfinite width regime, which does not learn features, and standard gradient\ndescent training, which does. Informally, we also show that it is analogous to\na damped version of the Gauss-Newton algorithm -- a second-order method. We\nshow that in a variety of cases, iterative linearised training surprisingly\nperforms on par with standard training, noting in particular how much less\nfrequent feature learning is required to achieve comparable performance. We\nalso show that feature learning is essential for good performance. Since such\nfeature learning inevitably causes changes in the NTK kernel, we provide direct\nnegative evidence for the NTK theory, which states the NTK kernel remains\nconstant during training.\n","authors":["Adrian Goldwaser","Hong Ge"],"pdf_url":"https://arxiv.org/pdf/2211.12345v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08419v1","updated":"2023-10-12T15:38:28Z","published":"2023-10-12T15:38:28Z","title":"Jailbreaking Black Box Large Language Models in Twenty Queries","summary":" There is growing interest in ensuring that large language models (LLMs) align\nwith human values. However, the alignment of such models is vulnerable to\nadversarial jailbreaks, which coax LLMs into overriding their safety\nguardrails. 
The identification of these vulnerabilities is therefore\ninstrumental in understanding inherent weaknesses and preventing future misuse.\nTo this end, we propose Prompt Automatic Iterative Refinement (PAIR), an\nalgorithm that generates semantic jailbreaks with only black-box access to an\nLLM. PAIR -- which is inspired by social engineering attacks -- uses an\nattacker LLM to automatically generate jailbreaks for a separate targeted LLM\nwithout human intervention. In this way, the attacker LLM iteratively queries\nthe target LLM to update and refine a candidate jailbreak. Empirically, PAIR\noften requires fewer than twenty queries to produce a jailbreak, which is\norders of magnitude more efficient than existing algorithms. PAIR also achieves\ncompetitive jailbreaking success rates and transferability on open and\nclosed-source LLMs, including GPT-3.5/4, Vicuna, and PaLM-2.\n","authors":["Patrick Chao","Alexander Robey","Edgar Dobriban","Hamed Hassani","George J. Pappas","Eric Wong"],"pdf_url":"https://arxiv.org/pdf/2310.08419v1.pdf","comment":"21 pages, 10 figures"},{"id":"http://arxiv.org/abs/2303.10108v2","updated":"2023-10-12T15:24:28Z","published":"2023-03-17T16:39:21Z","title":"Data-Centric Learning from Unlabeled Graphs with Diffusion Model","summary":" Graph property prediction tasks are important and numerous. While each task\noffers a small size of labeled examples, unlabeled graphs have been collected\nfrom various sources and at a large scale. A conventional approach is training\na model with the unlabeled graphs on self-supervised tasks and then fine-tuning\nthe model on the prediction tasks. However, the self-supervised task knowledge\ncould not be aligned or sometimes conflicted with what the predictions needed.\nIn this paper, we propose to extract the knowledge underlying the large set of\nunlabeled graphs as a specific set of useful data points to augment each\nproperty prediction model. We use a diffusion model to fully utilize the\nunlabeled graphs and design two new objectives to guide the model's denoising\nprocess with each task's labeled data to generate task-specific graph examples\nand their labels. Experiments demonstrate that our data-centric approach\nperforms significantly better than fifteen existing various methods on fifteen\ntasks. The performance improvement brought by unlabeled data is visible as the\ngenerated labeled examples unlike the self-supervised learning.\n","authors":["Gang Liu","Eric Inae","Tong Zhao","Jiaxin Xu","Tengfei Luo","Meng Jiang"],"pdf_url":"https://arxiv.org/pdf/2303.10108v2.pdf","comment":"Accepted by NeurIPS 2023"},{"id":"http://arxiv.org/abs/2310.08394v1","updated":"2023-10-12T15:07:11Z","published":"2023-10-12T15:07:11Z","title":"Towards Better Evaluation of Instruction-Following: A Case-Study in\n Summarization","summary":" Despite recent advances, evaluating how well large language models (LLMs)\nfollow user instructions remains an open problem. While evaluation methods of\nlanguage models have seen a rise in prompt-based approaches, limited work on\nthe correctness of these methods has been conducted. In this work, we perform a\nmeta-evaluation of a variety of metrics to quantify how accurately they measure\nthe instruction-following abilities of LLMs. Our investigation is performed on\ngrounded query-based summarization by collecting a new short-form, real-world\ndataset riSum, containing $300$ document-instruction pairs with $3$ answers\neach. All $900$ answers are rated by $3$ human annotators. 
Using riSum, we\nanalyze agreement between evaluation methods and human judgment. Finally, we\npropose new LLM-based reference-free evaluation methods that improve upon\nestablished baselines and perform on-par with costly reference-based metrics\nwhich require high-quality summaries.\n","authors":["Ondrej Skopek","Rahul Aralikatte","Sian Gooding","Victor Carbune"],"pdf_url":"https://arxiv.org/pdf/2310.08394v1.pdf","comment":"Accepted to CoNLL 2023"},{"id":"http://arxiv.org/abs/2308.12243v2","updated":"2023-10-12T15:06:50Z","published":"2023-08-23T16:42:27Z","title":"Multi-Objective Optimization for Sparse Deep Neural Network Training","summary":" Different conflicting optimization criteria arise naturally in various Deep\nLearning scenarios. These can address different main tasks (i.e., in the\nsetting of Multi-Task Learning), but also main and secondary tasks such as loss\nminimization versus sparsity. The usual approach is a simple weighting of the\ncriteria, which formally only works in the convex setting. In this paper, we\npresent a Multi-Objective Optimization algorithm using a modified Weighted\nChebyshev scalarization for training Deep Neural Networks (DNNs) with respect\nto several tasks. By employing this scalarization technique, the algorithm can\nidentify all optimal solutions of the original problem while reducing its\ncomplexity to a sequence of single-objective problems. The simplified problems\nare then solved using an Augmented Lagrangian method, enabling the use of\npopular optimization techniques such as Adam and Stochastic Gradient Descent,\nwhile efficaciously handling constraints. Our work aims to address the\n(economical and also ecological) sustainability issue of DNN models, with a\nparticular focus on Deep Multi-Task models, which are typically designed with a\nvery large number of weights to perform equally well on multiple tasks. Through\nexperiments conducted on two Machine Learning datasets, we demonstrate the\npossibility of adaptively sparsifying the model during training without\nsignificantly impacting its performance, if we are willing to apply\ntask-specific adaptations to the network weights. Code is available at\nhttps://github.com/salomonhotegni/MDMTN.\n","authors":["S. S. Hotegni","S. Peitz","M. Berkemeier"],"pdf_url":"https://arxiv.org/pdf/2308.12243v2.pdf","comment":"13 pages, 7 figures"},{"id":"http://arxiv.org/abs/2304.02621v2","updated":"2023-10-12T15:05:15Z","published":"2023-04-05T17:43:57Z","title":"High-fidelity Pseudo-labels for Boosting Weakly-Supervised Segmentation","summary":" Image-level weakly-supervised semantic segmentation (WSSS) reduces the\nusually vast data annotation cost by surrogate segmentation masks during\ntraining. The typical approach involves training an image classification\nnetwork using global average pooling (GAP) on convolutional feature maps. This\nenables the estimation of object locations based on class activation maps\n(CAMs), which identify the importance of image regions. The CAMs are then used\nto generate pseudo-labels, in the form of segmentation masks, to supervise a\nsegmentation model in the absence of pixel-level ground truth. Our work is\nbased on two techniques for improving CAMs; importance sampling, which is a\nsubstitute for GAP, and the feature similarity loss, which utilizes a heuristic\nthat object contours almost always align with color edges in images. 
However,\nboth are based on the multinomial posterior with softmax, and implicitly assume\nthat classes are mutually exclusive, which turns out to be suboptimal in our\nexperiments. Thus, we reformulate both techniques based on binomial posteriors\nof multiple independent binary problems. This has two benefits: their\nperformance is improved and they become more general, resulting in an add-on\nmethod that can boost virtually any WSSS method. This is demonstrated on a wide\nvariety of baselines on the PASCAL VOC dataset, improving the region similarity\nand contour quality of all implemented state-of-the-art methods. Experiments on\nthe MS COCO dataset show that our proposed add-on is well-suited for\nlarge-scale settings. Our code is available at https://github.com/arvijj/hfpl.\n","authors":["Arvi Jonnarth","Yushan Zhang","Michael Felsberg"],"pdf_url":"https://arxiv.org/pdf/2304.02621v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08392v1","updated":"2023-10-12T15:03:50Z","published":"2023-10-12T15:03:50Z","title":"Introducing a Deep Neural Network-based Model Predictive Control\n Framework for Rapid Controller Implementation","summary":" Model Predictive Control (MPC) provides an optimal control solution based on\na cost function while allowing for the implementation of process constraints.\nAs a model-based optimal control technique, the performance of MPC strongly\ndepends on the model used, where a trade-off between model computation time and\nprediction performance exists. One solution is the integration of MPC with a\nmachine learning (ML) based process model which is quick to evaluate online.\nThis work presents the experimental implementation of a deep neural network\n(DNN) based nonlinear MPC for Homogeneous Charge Compression Ignition (HCCI)\ncombustion control. The DNN model consists of a Long Short-Term Memory (LSTM)\nnetwork surrounded by fully connected layers which was trained using\nexperimental engine data and showed acceptable prediction performance with\nunder 5% error for all outputs. Using this model, the MPC is designed to track\nthe Indicated Mean Effective Pressure (IMEP) and combustion phasing\ntrajectories, while minimizing several parameters. Using the acados software\npackage to enable the real-time implementation of the MPC on an ARM Cortex A72,\nthe optimization calculations are completed within 1.4 ms. The external A72\nprocessor is integrated with the prototyping engine controller using a UDP\nconnection allowing for rapid experimental deployment of the NMPC. The IMEP\ntrajectory following of the developed controller was excellent, with a\nroot-mean-square error of 0.133 bar, in addition to observing process\nconstraints.\n","authors":["David C. Gordon","Alexander Winkler","Julian Bedei","Patrick Schaber","Jakob Andert","Charles R. Koch"],"pdf_url":"https://arxiv.org/pdf/2310.08392v1.pdf","comment":"Submitted to 2024 American Control Conference (ACC), July 8-12, 2024\n in Toronto, Canada. ACC is the annual conference of the American Automatic\n Control Council (AACC), the U.S. 
national member organization of the\n International Federation for Automatic Control (IFAC)"},{"id":"http://arxiv.org/abs/2310.08391v1","updated":"2023-10-12T15:01:43Z","published":"2023-10-12T15:01:43Z","title":"How Many Pretraining Tasks Are Needed for In-Context Learning of Linear\n Regression?","summary":" Transformers pretrained on diverse tasks exhibit remarkable in-context\nlearning (ICL) capabilities, enabling them to solve unseen tasks solely based\non input contexts without adjusting model parameters. In this paper, we study\nICL in one of its simplest setups: pretraining a linearly parameterized\nsingle-layer linear attention model for linear regression with a Gaussian\nprior. We establish a statistical task complexity bound for the attention model\npretraining, showing that effective pretraining only requires a small number of\nindependent tasks. Furthermore, we prove that the pretrained model closely\nmatches the Bayes optimal algorithm, i.e., optimally tuned ridge regression, by\nachieving nearly Bayes optimal risk on unseen tasks under a fixed context\nlength. These theoretical findings complement prior experimental research and\nshed light on the statistical foundations of ICL.\n","authors":["Jingfeng Wu","Difan Zou","Zixiang Chen","Vladimir Braverman","Quanquan Gu","Peter L. Bartlett"],"pdf_url":"https://arxiv.org/pdf/2310.08391v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.15395v2","updated":"2023-10-12T15:00:55Z","published":"2023-09-27T04:33:09Z","title":"Model-Free, Regret-Optimal Best Policy Identification in Online CMDPs","summary":" This paper considers the best policy identification (BPI) problem in online\nConstrained Markov Decision Processes (CMDPs). We are interested in algorithms\nthat are model-free, have low regret, and identify an optimal policy with a\nhigh probability. Existing model-free algorithms for online CMDPs with\nsublinear regret and constraint violation do not provide any convergence\nguarantee to an optimal policy and provide only average performance guarantees\nwhen a policy is uniformly sampled at random from all previously used policies.\nIn this paper, we develop a new algorithm, named\nPruning-Refinement-Identification (PRI), based on a fundamental structural\nproperty of CMDPs we discover, called limited stochasticity. The property says\nfor a CMDP with $N$ constraints, there exists an optimal policy with at most\n$N$ stochastic decisions.\n The proposed algorithm first identifies at which step and in which state a\nstochastic decision has to be taken and then fine-tunes the distributions of\nthese stochastic decisions. 
PRI achieves trio objectives: (i) PRI is a\nmodel-free algorithm; and (ii) it outputs a near-optimal policy with a high\nprobability at the end of learning; and (iii) in the tabular setting, PRI\nguarantees $\\tilde{\\mathcal{O}}(\\sqrt{K})$ regret and constraint violation,\nwhich significantly improves the best existing regret bound\n$\\tilde{\\mathcal{O}}(K^{\\frac{4}{5}})$ under a model-free algorithm, where $K$\nis the total number of episodes.\n","authors":["Zihan Zhou","Honghao Wei","Lei Ying"],"pdf_url":"https://arxiv.org/pdf/2309.15395v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08387v1","updated":"2023-10-12T14:59:22Z","published":"2023-10-12T14:59:22Z","title":"MeanAP-Guided Reinforced Active Learning for Object Detection","summary":" Active learning presents a promising avenue for training high-performance\nmodels with minimal labeled data, achieved by judiciously selecting the most\ninformative instances to label and incorporating them into the task learner.\nDespite notable advancements in active learning for image recognition, metrics\ndevised or learned to gauge the information gain of data, crucial for query\nstrategy design, do not consistently align with task model performance metrics,\nsuch as Mean Average Precision (MeanAP) in object detection tasks. This paper\nintroduces MeanAP-Guided Reinforced Active Learning for Object Detection\n(MAGRAL), a novel approach that directly utilizes the MeanAP metric of the task\nmodel to devise a sampling strategy employing a reinforcement learning-based\nsampling agent. Built upon LSTM architecture, the agent efficiently explores\nand selects subsequent training instances, and optimizes the process through\npolicy gradient with MeanAP serving as reward. Recognizing the time-intensive\nnature of MeanAP computation at each step, we propose fast look-up tables to\nexpedite agent training. We assess MAGRAL's efficacy across popular benchmarks,\nPASCAL VOC and MS COCO, utilizing different backbone architectures. Empirical\nfindings substantiate MAGRAL's superiority over recent state-of-the-art\nmethods, showcasing substantial performance gains. MAGRAL establishes a robust\nbaseline for reinforced active object detection, signifying its potential in\nadvancing the field.\n","authors":["Zhixuan Liang","Xingyu Zeng","Rui Zhao","Ping Luo"],"pdf_url":"https://arxiv.org/pdf/2310.08387v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08381v1","updated":"2023-10-12T14:55:31Z","published":"2023-10-12T14:55:31Z","title":"AutoVP: An Automated Visual Prompting Framework and Benchmark","summary":" Visual prompting (VP) is an emerging parameter-efficient fine-tuning approach\nto adapting pre-trained vision models to solve various downstream\nimage-classification tasks. However, there has hitherto been little systematic\nstudy of the design space of VP and no clear benchmark for evaluating its\nperformance. To bridge this gap, we propose AutoVP, an end-to-end expandable\nframework for automating VP design choices, along with 12 downstream\nimage-classification tasks that can serve as a holistic VP-performance\nbenchmark. Our design space covers 1) the joint optimization of the prompts; 2)\nthe selection of pre-trained models, including image classifiers and text-image\nencoders; and 3) model output mapping strategies, including nonparametric and\ntrainable label mapping. 
Our extensive experimental results show that AutoVP\noutperforms the best-known current VP methods by a substantial margin, having\nup to 6.7% improvement in accuracy; and attains a maximum performance increase\nof 27.5% compared to linear-probing (LP) baseline. AutoVP thus makes a two-fold\ncontribution: serving both as an efficient tool for hyperparameter tuning on VP\ndesign choices, and as a comprehensive benchmark that can reasonably be\nexpected to accelerate VP's development. The source code is available at\nhttps://github.com/IBM/AutoVP.\n","authors":["Hsi-Ai Tsao","Lei Hsiung","Pin-Yu Chen","Sijia Liu","Tsung-Yi Ho"],"pdf_url":"https://arxiv.org/pdf/2310.08381v1.pdf","comment":"Preprint. The code is available at https://github.com/IBM/AutoVP"},{"id":"http://arxiv.org/abs/2306.16335v2","updated":"2023-10-12T14:46:15Z","published":"2023-06-28T16:11:50Z","title":"Emulating the dynamics of complex systems using autoregressive models on\n manifolds (mNARX)","summary":" We propose a novel surrogate modelling approach to efficiently and accurately\napproximate the response of complex dynamical systems driven by time-varying\nexogenous excitations over extended time periods. Our approach, namely manifold\nnonlinear autoregressive modelling with exogenous input (mNARX), involves\nconstructing a problem-specific exogenous input manifold that is optimal for\nconstructing autoregressive surrogates. The manifold, which forms the core of\nmNARX, is constructed incrementally by incorporating the physics of the system,\nas well as prior expert- and domain- knowledge. Because mNARX decomposes the\nfull problem into a series of smaller sub-problems, each with a lower\ncomplexity than the original, it scales well with the complexity of the\nproblem, both in terms of training and evaluation costs of the final surrogate.\nFurthermore, mNARX synergizes well with traditional dimensionality reduction\ntechniques, making it highly suitable for modelling dynamical systems with\nhigh-dimensional exogenous inputs, a class of problems that is typically\nchallenging to solve. Since domain knowledge is particularly abundant in\nphysical systems, such as those found in civil and mechanical engineering,\nmNARX is well suited for these applications. We demonstrate that mNARX\noutperforms traditional autoregressive surrogates in predicting the response of\na classical coupled spring-mass system excited by a one-dimensional random\nexcitation. Additionally, we show that mNARX is well suited for emulating very\nhigh-dimensional time- and state-dependent systems, even when affected by\nactive controllers, by surrogating the dynamics of a realistic\naero-servo-elastic onshore wind turbine simulator. In general, our results\ndemonstrate that mNARX offers promising prospects for modelling complex\ndynamical systems, in terms of accuracy and efficiency.\n","authors":["Styfen Schär","Stefano Marelli","Bruno Sudret"],"pdf_url":"https://arxiv.org/pdf/2306.16335v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08367v1","updated":"2023-10-12T14:38:25Z","published":"2023-10-12T14:38:25Z","title":"MCU: A Task-centric Framework for Open-ended Agent Evaluation in\n Minecraft","summary":" To pursue the goal of creating an open-ended agent in Minecraft, an\nopen-ended game environment with unlimited possibilities, this paper introduces\na task-centric framework named MCU for Minecraft agent evaluation. 
The MCU\nframework leverages the concept of atom tasks as fundamental building blocks,\nenabling the generation of diverse or even arbitrary tasks. Within the MCU\nframework, each task is measured with six distinct difficulty scores (time\nconsumption, operational effort, planning complexity, intricacy, creativity,\nnovelty). These scores offer a multi-dimensional assessment of a task from\ndifferent angles, and thus can reveal an agent's capability on specific facets.\nThe difficulty scores also serve as the feature of each task, which creates a\nmeaningful task space and unveils the relationship between tasks. For efficient\nevaluation of Minecraft agents employing the MCU framework, we maintain a\nunified benchmark, namely SkillForge, which comprises representative tasks with\ndiverse categories and difficulty distribution. We also provide convenient\nfilters for users to select tasks to assess specific capabilities of agents. We\nshow that MCU has the high expressivity to cover all tasks used in recent\nliterature on Minecraft agent, and underscores the need for advancements in\nareas such as creativity, precise control, and out-of-distribution\ngeneralization under the goal of open-ended Minecraft agent development.\n","authors":["Haowei Lin","Zihao Wang","Jianzhu Ma","Yitao Liang"],"pdf_url":"https://arxiv.org/pdf/2310.08367v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08358v1","updated":"2023-10-12T14:29:02Z","published":"2023-10-12T14:29:02Z","title":"Towards Demystifying the Generalization Behaviors When Neural Collapse\n Emerges","summary":" Neural Collapse (NC) is a well-known phenomenon of deep neural networks in\nthe terminal phase of training (TPT). It is characterized by the collapse of\nfeatures and classifier into a symmetrical structure, known as simplex\nequiangular tight frame (ETF). While there have been extensive studies on\noptimization characteristics showing the global optimality of neural collapse,\nlittle research has been done on the generalization behaviors during the\noccurrence of NC. Particularly, the important phenomenon of generalization\nimprovement during TPT has been remaining in an empirical observation and\nlacking rigorous theoretical explanation. In this paper, we establish the\nconnection between the minimization of CE and a multi-class SVM during TPT, and\nthen derive a multi-class margin generalization bound, which provides a\ntheoretical explanation for why continuing training can still lead to accuracy\nimprovement on test set, even after the train accuracy has reached 100%.\nAdditionally, our further theoretical results indicate that different alignment\nbetween labels and features in a simplex ETF can result in varying degrees of\ngeneralization improvement, despite all models reaching NC and demonstrating\nsimilar optimization performance on train set. We refer to this newly\ndiscovered property as \"non-conservative generalization\". In experiments, we\nalso provide empirical observations to verify the indications suggested by our\ntheoretical results.\n","authors":["Peifeng Gao","Qianqian Xu","Yibo Yang","Peisong Wen","Huiyang Shao","Zhiyong Yang","Bernard Ghanem","Qingming Huang"],"pdf_url":"https://arxiv.org/pdf/2310.08358v1.pdf","comment":"20 pages, 6 figures. 
arXiv admin note: substantial text overlap with\n arXiv:2304.08914"},{"id":"http://arxiv.org/abs/2310.06970v2","updated":"2023-10-12T14:21:52Z","published":"2023-10-10T19:47:58Z","title":"Flood and Echo: Algorithmic Alignment of GNNs with Distributed Computing","summary":" Graph Neural Networks are a natural fit for learning algorithms. They can\ndirectly represent tasks through an abstract but versatile graph structure and\nhandle inputs of different sizes. This opens up the possibility for scaling and\nextrapolation to larger graphs, one of the most important advantages of an\nalgorithm. However, this raises two core questions: i) How can we enable nodes\nto gather the required information in a given graph ($\\textit{information\nexchange}$), even if it is far away and ii) How can we design an execution\nframework which enables this information exchange for extrapolation to larger\ngraph sizes ($\\textit{algorithmic alignment for extrapolation}$). We propose a\nnew execution framework that is inspired by the design principles of\ndistributed algorithms: Flood and Echo Net. It propagates messages through the\nentire graph in a wave-like activation pattern, which naturally generalizes to\nlarger instances. Through its sparse but parallel activations it is provably\nmore efficient in terms of message complexity. We study the proposed model and\nprovide both empirical evidence and theoretical insights in terms of its\nexpressiveness, efficiency, information exchange and ability to extrapolate.\n","authors":["Joël Mathys","Florian Grötschla","Kalyan Varma Nadimpalli","Roger Wattenhofer"],"pdf_url":"https://arxiv.org/pdf/2310.06970v2.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2310.08348v1","updated":"2023-10-12T14:18:09Z","published":"2023-10-12T14:18:09Z","title":"LightZero: A Unified Benchmark for Monte Carlo Tree Search in General\n Sequential Decision Scenarios","summary":" Building agents based on tree-search planning capabilities with learned\nmodels has achieved remarkable success in classic decision-making problems,\nsuch as Go and Atari. However, it has been deemed challenging or even\ninfeasible to extend Monte Carlo Tree Search (MCTS) based algorithms to diverse\nreal-world applications, especially when these environments involve complex\naction spaces and significant simulation costs, or inherent stochasticity. In\nthis work, we introduce LightZero, the first unified benchmark for deploying\nMCTS/MuZero in general sequential decision scenarios. Specifically, we\nsummarize the most critical challenges in designing a general MCTS-style\ndecision-making solver, then decompose the tightly-coupled algorithm and system\ndesign of tree-search RL methods into distinct sub-modules. By incorporating\nmore appropriate exploration and optimization strategies, we can significantly\nenhance these sub-modules and construct powerful LightZero agents to tackle\ntasks across a wide range of domains, such as board games, Atari, MuJoCo,\nMiniGrid and GoBigger. Detailed benchmark results reveal the significant\npotential of such methods in building scalable and efficient decision\nintelligence. 
The code is available as part of OpenDILab at\nhttps://github.com/opendilab/LightZero.\n","authors":["Yazhe Niu","Yuan Pu","Zhenjie Yang","Xueyan Li","Tong Zhou","Jiyuan Ren","Shuai Hu","Hongsheng Li","Yu Liu"],"pdf_url":"https://arxiv.org/pdf/2310.08348v1.pdf","comment":"NeurIPS 2023 Spotlight"},{"id":"http://arxiv.org/abs/2306.14041v2","updated":"2023-10-12T14:16:15Z","published":"2023-06-24T19:22:01Z","title":"Smoothed $f$-Divergence Distributionally Robust Optimization","summary":" In data-driven optimization, sample average approximation (SAA) is known to\nsuffer from the so-called optimizer's curse that causes an over-optimistic\nevaluation of the solution performance. We argue that a special type of\ndistributionally robust optimization (DRO) formulation offers theoretical\nadvantages in correcting for this optimizer's curse compared to simple\n``margin'' adjustments to SAA and other DRO approaches: It attains a\nstatistical bound on the out-of-sample performance, for a wide class of\nobjective functions and distributions, that is nearly tightest in terms of\nexponential decay rate. This DRO uses an ambiguity set based on a Kullback-Leibler\n(KL) divergence smoothed by the Wasserstein or L\\'evy-Prokhorov (LP)\ndistance via a suitable distance optimization. Computationally, we also show\nthat such a DRO, and its generalized versions using smoothed $f$-divergence,\nare not harder than DRO problems based on $f$-divergence or Wasserstein\ndistances, rendering our DRO formulations both statistically optimal and\ncomputationally viable.\n","authors":["Zhenyuan Liu","Bart P. G. Van Parys","Henry Lam"],"pdf_url":"https://arxiv.org/pdf/2306.14041v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07579v2","updated":"2023-10-12T14:15:24Z","published":"2023-10-11T15:19:31Z","title":"In-Context Unlearning: Language Models as Few Shot Unlearners","summary":" Machine unlearning, the study of efficiently removing the impact of specific\ntraining points on the trained model, has garnered increased attention of late,\ndriven by the need to comply with privacy regulations like the Right to be\nForgotten. Although unlearning is particularly relevant for LLMs in light of\nthe copyright issues they raise, achieving precise unlearning is\ncomputationally infeasible for very large models. To this end, recent work has\nproposed several algorithms which approximate the removal of training data\nwithout retraining the model. These algorithms crucially rely on access to the\nmodel parameters in order to update them, an assumption that may not hold in\npractice due to computational constraints or when the LLM is accessed via API.\nIn this work, we propose a new class of unlearning methods for LLMs we call\n''In-Context Unlearning'', providing inputs in context and without having to\nupdate model parameters. To unlearn a particular training instance, we provide\nthe instance alongside a flipped label and additional correctly labelled\ninstances which are prepended as inputs to the LLM at inference time. 
Our\nexperimental results demonstrate that these contexts effectively remove\nspecific information from the training set while maintaining performance levels\nthat are competitive with (or in some cases exceed) state-of-the-art unlearning\nmethods that require access to the LLM parameters.\n","authors":["Martin Pawelczyk","Seth Neel","Himabindu Lakkaraju"],"pdf_url":"https://arxiv.org/pdf/2310.07579v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.00152v4","updated":"2023-10-12T14:05:03Z","published":"2023-04-29T02:27:42Z","title":"Limits of Model Selection under Transfer Learning","summary":" Theoretical studies on transfer learning or domain adaptation have so far\nfocused on situations with a known hypothesis class or model; however in\npractice, some amount of model selection is usually involved, often appearing\nunder the umbrella term of hyperparameter-tuning: for example, one may think of\nthe problem of tuning for the right neural network architecture towards a\ntarget task, while leveraging data from a related source task.\n Now, in addition to the usual tradeoffs on approximation vs estimation errors\ninvolved in model selection, this problem brings in a new complexity term,\nnamely, the transfer distance between source and target distributions, which is\nknown to vary with the choice of hypothesis class.\n We present a first study of this problem, focusing on classification; in\nparticular, the analysis reveals some remarkable phenomena: adaptive rates,\ni.e., those achievable with no distributional information, can be arbitrarily\nslower than oracle rates, i.e., when given knowledge on distances.\n","authors":["Steve Hanneke","Samory Kpotufe","Yasaman Mahdaviyeh"],"pdf_url":"https://arxiv.org/pdf/2305.00152v4.pdf","comment":"Accepted for presentation at the Conference on Learning Theory (COLT)\n 2023"},{"id":"http://arxiv.org/abs/2310.08339v1","updated":"2023-10-12T13:57:32Z","published":"2023-10-12T13:57:32Z","title":"A Generic Software Framework for Distributed Topological Analysis\n Pipelines","summary":" This system paper presents a software framework for the support of\ntopological analysis pipelines in a distributed-memory model. While several\nrecent papers introduced topology-based approaches for distributed-memory\nenvironments, these were reporting experiments obtained with tailored,\nmono-algorithm implementations. In contrast, we describe in this paper a\ngeneral-purpose, generic framework for topological analysis pipelines, i.e. a\nsequence of topological algorithms interacting together, possibly on distinct\nnumbers of processes. Specifically, we instantiated our framework with the MPI\nmodel, within the Topology ToolKit (TTK). While developing this framework, we\nfaced several algorithmic and software engineering challenges, which we\ndocument in this paper. We provide a taxonomy for the distributed-memory\ntopological algorithms supported by TTK, depending on their communication needs\nand provide examples of hybrid MPI+thread parallelizations. Detailed\nperformance analyses show that parallel efficiencies range from $20\\%$ to\n$80\\%$ (depending on the algorithms), and that the MPI-specific preconditioning\nintroduced by our framework induces a negligible computation time overhead. 
We\nillustrate the new distributed-memory capabilities of TTK with an example of\nadvanced analysis pipeline, combining multiple algorithms, run on the largest\npublicly available dataset we have found (120 billion vertices) on a standard\ncluster with 64 nodes (for a total of 1,536 cores). Finally, we provide a\nroadmap for the completion of TTK's MPI extension, along with generic\nrecommendations for each algorithm communication category.\n","authors":["Eve Le Guillou","Michael Will","Pierre Guillou","Jonas Lukasczyk","Pierre Fortin","Christoph Garth","Julien Tierny"],"pdf_url":"https://arxiv.org/pdf/2310.08339v1.pdf","comment":"18 pages, 12 figures"},{"id":"http://arxiv.org/abs/2310.08337v1","updated":"2023-10-12T13:54:55Z","published":"2023-10-12T13:54:55Z","title":"Neural Diffusion Models","summary":" Diffusion models have shown remarkable performance on many generative tasks.\nDespite recent success, most diffusion models are restricted in that they only\nallow linear transformation of the data distribution. In contrast, broader\nfamily of transformations can potentially help train generative distributions\nmore efficiently, simplifying the reverse process and closing the gap between\nthe true negative log-likelihood and the variational approximation. In this\npaper, we present Neural Diffusion Models (NDMs), a generalization of\nconventional diffusion models that enables defining and learning time-dependent\nnon-linear transformations of data. We show how to optimise NDMs using a\nvariational bound in a simulation-free setting. Moreover, we derive a\ntime-continuous formulation of NDMs, which allows fast and reliable inference\nusing off-the-shelf numerical ODE and SDE solvers. Finally, we demonstrate the\nutility of NDMs with learnable transformations through experiments on standard\nimage generation benchmarks, including CIFAR-10, downsampled versions of\nImageNet and CelebA-HQ. NDMs outperform conventional diffusion models in terms\nof likelihood and produce high-quality samples.\n","authors":["Grigory Bartosh","Dmitry Vetrov","Christian A. Naesseth"],"pdf_url":"https://arxiv.org/pdf/2310.08337v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08331v1","updated":"2023-10-12T13:45:33Z","published":"2023-10-12T13:45:33Z","title":"Impact of multi-armed bandit strategies on deep recurrent reinforcement\n learning","summary":" Incomplete knowledge of the environment leads an agent to make decisions\nunder uncertainty. One of the major dilemmas in Reinforcement Learning (RL)\nwhere an autonomous agent has to balance two contrasting needs in making its\ndecisions is: exploiting the current knowledge of the environment to maximize\nthe cumulative reward as well as exploring actions that allow improving the\nknowledge of the environment, hopefully leading to higher reward values\n(exploration-exploitation trade-off). Concurrently, another relevant issue\nregards the full observability of the states, which may not be assumed in all\napplications. Such as when only 2D images are considered as input in a RL\napproach used for finding the optimal action within a 3D simulation\nenvironment. In this work, we address these issues by deploying and testing\nseveral techniques to balance exploration and exploitation trade-off on\npartially observable systems for predicting steering wheels in autonomous\ndriving scenario. 
More precisely, the final aim is to investigate the effects\nof using both stochastic and deterministic multi-armed bandit strategies\ncoupled with a Deep Recurrent Q-Network. Additionally, we adapted and evaluated\nthe impact of an innovative method to improve the learning phase of the\nunderlying Convolutional Recurrent Neural Network. We aim to show that adaptive\nstochastic methods for exploration better approximate the trade-off between\nexploration and exploitation as, in general, Softmax and Max-Boltzmann\nstrategies are able to outperform epsilon-greedy techniques.\n","authors":["Valentina Zangirolami","Matteo Borrotti"],"pdf_url":"https://arxiv.org/pdf/2310.08331v1.pdf","comment":"26 pages"},{"id":"http://arxiv.org/abs/2309.03004v2","updated":"2023-10-12T13:36:41Z","published":"2023-09-06T13:48:40Z","title":"A Theoretical Explanation of Activation Sparsity through Flat Minima and\n Adversarial Robustness","summary":" A recent empirical observation (Li et al., 2022b) of activation sparsity in\nMLP blocks offers an opportunity to drastically reduce computation costs for\nfree. Although having attributed it to training dynamics, existing theoretical\nexplanations of activation sparsity are restricted to shallow networks, small\ntraining steps and special training, despite its emergence in deep models\nstandardly trained for a large number of steps. To fill these gaps, we propose\nthe notion of gradient sparsity as one source of activation sparsity and a\ntheoretical explanation based on it that sees sparsity a necessary step to\nadversarial robustness w.r.t. hidden features and parameters, which is\napproximately the flatness of minima for well-learned models. The theory\napplies to standardly trained LayerNorm-ed MLPs, and further to Transformers or\nother architectures trained with weight noises. Eliminating other sources of\nflatness except for sparsity, we discover the phenomenon that the ratio between\nthe largest and smallest non-zero singular values of weight matrices is small.\nWhen discussing the emergence of this spectral concentration, we use random\nmatrix theory (RMT) as a powerful tool to analyze stochastic gradient noises.\nValidational experiments are conducted to verify our gradient-sparsity-based\nexplanation. We propose two plug-and-play modules for both training and\nfinetuning for sparsity. Experiments on ImageNet-1k and C4 demonstrate their\n50% sparsity improvements, indicating further potential cost reduction in both\ntraining and inference.\n","authors":["Ze Peng","Lei Qi","Yinghuan Shi","Yang Gao"],"pdf_url":"https://arxiv.org/pdf/2309.03004v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08320v1","updated":"2023-10-12T13:33:04Z","published":"2023-10-12T13:33:04Z","title":"Defending Our Privacy With Backdoors","summary":" The proliferation of large AI models trained on uncurated, often sensitive\nweb-scraped data has raised significant privacy concerns. One of the concerns\nis that adversaries can extract information about the training data using\nprivacy attacks. Unfortunately, the task of removing specific information from\nthe models without sacrificing performance is not straightforward and has\nproven to be challenging. We propose a rather easy yet effective defense based\non backdoor attacks to remove private information such as names of individuals\nfrom models, and focus in this work on text encoders. 
Specifically, through\nstrategic insertion of backdoors, we align the embeddings of sensitive phrases\nwith those of neutral terms, such as \"a person\" instead of the person's name. Our\nempirical results demonstrate the effectiveness of our backdoor-based defense\non CLIP by assessing its performance using a specialized privacy attack for\nzero-shot classifiers. Our approach provides not only a new \"dual-use\"\nperspective on backdoor attacks, but also presents a promising avenue to\nenhance the privacy of individuals within models trained on uncurated\nweb-scraped data.\n","authors":["Dominik Hintersdorf","Lukas Struppek","Daniel Neider","Kristian Kersting"],"pdf_url":"https://arxiv.org/pdf/2310.08320v1.pdf","comment":"14 pages, 4 figures"},{"id":"http://arxiv.org/abs/2310.08312v1","updated":"2023-10-12T13:20:17Z","published":"2023-10-12T13:20:17Z","title":"GePSAn: Generative Procedure Step Anticipation in Cooking Videos","summary":" We study the problem of future step anticipation in procedural videos. Given\na video of an ongoing procedural activity, we predict a plausible next\nprocedure step described in rich natural language. While most previous work\nfocuses on the problem of data scarcity in procedural video datasets, another\ncore challenge of future anticipation is how to account for multiple plausible\nfuture realizations in natural settings. This problem has been largely\noverlooked in previous work. To address this challenge, we frame future step\nprediction as modelling the distribution of all possible candidates for the\nnext step. Specifically, we design a generative model that takes a series of\nvideo clips as input, and generates multiple plausible and diverse candidates\n(in natural language) for the next step. Following previous work, we side-step\nthe video annotation scarcity by pretraining our model on a large text-based\ncorpus of procedural activities, and then transfer the model to the video\ndomain. Our experiments, both in textual and video domains, show that our model\ncaptures diversity in the next step prediction and generates multiple plausible\nfuture predictions. Moreover, our model establishes new state-of-the-art\nresults on YouCookII, where it outperforms existing baselines on the next step\nanticipation. Finally, we also show that our model can successfully transfer\nfrom text to the video domain zero-shot, i.e., without fine-tuning or adaptation,\nand produces good-quality future step predictions from video.\n","authors":["Mohamed Ashraf Abdelsalam","Samrudhdhi B. Rangrej","Isma Hadji","Nikita Dvornik","Konstantinos G. Derpanis","Afsaneh Fazly"],"pdf_url":"https://arxiv.org/pdf/2310.08312v1.pdf","comment":"published at ICCV 2023"},{"id":"http://arxiv.org/abs/2310.08304v1","updated":"2023-10-12T13:11:38Z","published":"2023-10-12T13:11:38Z","title":"CHIP: Contrastive Hierarchical Image Pretraining","summary":" Few-shot object classification is the task of classifying objects in an image\nwith a limited number of examples as supervision. We propose a one-shot/few-shot\nclassification model that can classify an object of any unseen class into a\nrelatively general category in a hierarchically based classification. Our\nmodel uses a three-level hierarchical contrastive loss based ResNet152\nclassifier for classifying an object based on its features extracted from image\nembedding, not used during the training phase. 
For our experimentation, we have\nused a subset of the ImageNet (ILSVRC-12) dataset that contains only the animal\nclasses for training our model and created our own dataset of unseen classes\nfor evaluating our trained model. Our model provides satisfactory results in\nclassifying the unknown objects into a generic category which has been later\ndiscussed in greater detail.\n","authors":["Arpit Mittal","Harshil Jhaveri","Swapnil Mallick","Abhishek Ajmera"],"pdf_url":"https://arxiv.org/pdf/2310.08304v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.07189v3","updated":"2023-10-12T12:47:22Z","published":"2023-03-13T15:30:28Z","title":"Optimizing Convolutional Neural Networks for Chronic Obstructive\n Pulmonary Disease Detection in Clinical Computed Tomography Imaging","summary":" We aim to optimize the binary detection of Chronic Obstructive Pulmonary\nDisease (COPD) based on emphysema presence in the lung with convolutional\nneural networks (CNN) by exploring manually adjusted versus automated\nwindow-setting optimization (WSO) on computed tomography (CT) images. 7,194 CT\nimages (3,597 with COPD; 3,597 healthy controls) from 78 subjects (43 with\nCOPD; 35 healthy controls) were selected retrospectively (10.2018-12.2019) and\npreprocessed. For each image, intensity values were manually clipped to the\nemphysema window setting and a baseline 'full-range' window setting.\nClass-balanced train, validation, and test sets contained 3,392, 1,114, and\n2,688 images. The network backbone was optimized by comparing various CNN\narchitectures. Furthermore, automated WSO was implemented by adding a\ncustomized layer to the model. The image-level area under the Receiver\nOperating Characteristics curve (AUC) [lower, upper limit 95% confidence] was\nutilized to compare model variations. Repeated inference (n=7) on the test set\nshowed that the DenseNet was the most efficient backbone and achieved a mean\nAUC of 0.80 [0.76, 0.85] without WSO. Comparably, with input images manually\nadjusted to the emphysema window, the DenseNet model predicted COPD with a mean\nAUC of 0.86 [0.82, 0.89]. By adding a customized WSO layer to the DenseNet, an\noptimal window in the proximity of the emphysema window setting was learned\nautomatically, and a mean AUC of 0.82 [0.78, 0.86] was achieved. Detection of\nCOPD with DenseNet models was improved by WSO of CT data to the emphysema\nwindow setting range.\n","authors":["Tina Dorosti","Manuel Schultheiss","Felix Hofmann","Johannes Thalhammer","Luisa Kirchner","Theresa Urban","Franz Pfeiffer","Florian Schaff","Tobias Lasser","Daniela Pfeiffer"],"pdf_url":"https://arxiv.org/pdf/2303.07189v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08287v1","updated":"2023-10-12T12:45:13Z","published":"2023-10-12T12:45:13Z","title":"A Symmetry-Aware Exploration of Bayesian Neural Network Posteriors","summary":" The distribution of the weights of modern deep neural networks (DNNs) -\ncrucial for uncertainty quantification and robustness - is an eminently complex\nobject due to its extremely high dimensionality. This paper proposes one of the\nfirst large-scale explorations of the posterior distribution of deep Bayesian\nNeural Networks (BNNs), expanding its study to real-world vision tasks and\narchitectures. Specifically, we investigate the optimal approach for\napproximating the posterior, analyze the connection between posterior quality\nand uncertainty quantification, delve into the impact of modes on the\nposterior, and explore methods for visualizing the posterior. 
Moreover, we\nuncover weight-space symmetries as a critical aspect for understanding the\nposterior. To this extent, we develop an in-depth assessment of the impact of\nboth permutation and scaling symmetries that tend to obfuscate the Bayesian\nposterior. While the first type of transformation is known for duplicating\nmodes, we explore the relationship between the latter and L2 regularization,\nchallenging previous misconceptions. Finally, to help the community improve our\nunderstanding of the Bayesian posterior, we will shortly release the first\nlarge-scale checkpoint dataset, including thousands of real-world models and\nour codes.\n","authors":["Olivier Laurent","Emanuel Aldea","Gianni Franchi"],"pdf_url":"https://arxiv.org/pdf/2310.08287v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08282v1","updated":"2023-10-12T12:39:08Z","published":"2023-10-12T12:39:08Z","title":"Data driven modeling of self-similar dynamics","summary":" Multiscale modeling of complex systems is crucial for understanding their\nintricacies. Data-driven multiscale modeling has emerged as a promising\napproach to tackle challenges associated with complex systems. On the other\nhand, self-similarity is prevalent in complex systems, hinting that large-scale\ncomplex systems can be modeled at a reduced cost. In this paper, we introduce a\nmultiscale neural network framework that incorporates self-similarity as prior\nknowledge, facilitating the modeling of self-similar dynamical systems. For\ndeterministic dynamics, our framework can discern whether the dynamics are\nself-similar. For uncertain dynamics, it can compare and determine which\nparameter set is closer to self-similarity. The framework allows us to extract\nscale-invariant kernels from the dynamics for modeling at any scale. Moreover,\nour method can identify the power law exponents in self-similar systems.\nPreliminary tests on the Ising model yielded critical exponents consistent with\ntheoretical expectations, providing valuable insights for addressing critical\nphase transitions in non-equilibrium systems.\n","authors":["Ruyi Tao","Ningning Tao","Yizhuang You","Jiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.08282v1.pdf","comment":"10 pages,4 figures,1 table"},{"id":"http://arxiv.org/abs/2310.08278v1","updated":"2023-10-12T12:29:32Z","published":"2023-10-12T12:29:32Z","title":"Lag-Llama: Towards Foundation Models for Time Series Forecasting","summary":" Aiming to build foundation models for time-series forecasting and study their\nscaling behavior, we present here our work-in-progress on Lag-Llama, a\ngeneral-purpose univariate probabilistic time-series forecasting model trained\non a large collection of time-series data. The model shows good zero-shot\nprediction capabilities on unseen \"out-of-distribution\" time-series datasets,\noutperforming supervised baselines. We use smoothly broken power-laws to fit\nand predict model scaling behavior. 
The open source code is made available at\nhttps://github.com/kashif/pytorch-transformer-ts.\n","authors":["Kashif Rasul","Arjun Ashok","Andrew Robert Williams","Arian Khorasani","George Adamopoulos","Rishika Bhagwatkar","Marin Biloš","Hena Ghonia","Nadhir Vincent Hassen","Anderson Schneider","Sahil Garg","Alexandre Drouin","Nicolas Chapados","Yuriy Nevmyvaka","Irina Rish"],"pdf_url":"https://arxiv.org/pdf/2310.08278v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.15394v2","updated":"2023-10-12T12:11:23Z","published":"2023-05-24T17:56:18Z","title":"Differentially-Private Decision Trees and Provable Robustness to Data\n Poisoning","summary":" Decision trees are interpretable models that are well-suited to non-linear\nlearning problems. Much work has been done on extending decision tree learning\nalgorithms with differential privacy, a system that guarantees the privacy of\nsamples within the training data. However, current state-of-the-art algorithms\nfor this purpose sacrifice much utility for a small privacy benefit. These\nsolutions create random decision nodes that reduce decision tree accuracy or\nspend an excessive share of the privacy budget on labeling leaves. Moreover,\nmany works do not support continuous features or leak information about them.\nWe propose a new method called PrivaTree based on private histograms that\nchooses good splits while consuming a small privacy budget. The resulting trees\nprovide a significantly better privacy-utility trade-off and accept mixed\nnumerical and categorical data without leaking information about numerical\nfeatures. Finally, while it is notoriously hard to give robustness guarantees\nagainst data poisoning attacks, we demonstrate bounds for the expected accuracy\nand success rates of backdoor attacks against differentially-private learners.\nBy leveraging the better privacy-utility trade-off of PrivaTree we are able to\ntrain decision trees with significantly better robustness against backdoor\nattacks compared to regular decision trees and with meaningful theoretical\nguarantees.\n","authors":["Daniël Vos","Jelle Vos","Tianyu Li","Zekeriya Erkin","Sicco Verwer"],"pdf_url":"https://arxiv.org/pdf/2305.15394v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08259v1","updated":"2023-10-12T12:05:51Z","published":"2023-10-12T12:05:51Z","title":"Invisible Threats: Backdoor Attack in OCR Systems","summary":" Optical Character Recognition (OCR) is a widely used tool to extract text\nfrom scanned documents. Today, the state-of-the-art is achieved by exploiting\ndeep neural networks. However, the cost of this performance is paid at the\nprice of system vulnerability. For instance, in backdoor attacks, attackers\ncompromise the training phase by inserting a backdoor in the victim's model\nthat will be activated at testing time by specific patterns while leaving the\noverall model performance intact. This work proposes a backdoor attack for OCR\nresulting in the injection of non-readable characters from malicious input\nimages. 
This simple but effective attack exposes the state-of-the-art OCR\nweakness, making the extracted text correct to human eyes but simultaneously\nunusable for the NLP application that uses OCR as a preprocessing step.\nExperimental results show that the attacked models successfully output\nnon-readable characters for around 90% of the poisoned instances without\nharming their performance for the remaining instances.\n","authors":["Mauro Conti","Nicola Farronato","Stefanos Koffas","Luca Pajola","Stjepan Picek"],"pdf_url":"https://arxiv.org/pdf/2310.08259v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.19443v2","updated":"2023-10-12T12:02:34Z","published":"2023-05-30T22:34:48Z","title":"OWAdapt: An adaptive loss function for deep learning using OWA operators","summary":" In this paper, we propose a fuzzy adaptive loss function for enhancing deep\nlearning performance in classification tasks. Specifically, we redefine the\ncross-entropy loss to effectively address class-level noise conditions,\nincluding the challenging problem of class imbalance. Our approach introduces\naggregation operators, leveraging the power of fuzzy logic to improve\nclassification accuracy. The rationale behind our proposed method lies in the\niterative up-weighting of class-level components within the loss function,\nfocusing on those with larger errors. To achieve this, we employ the ordered\nweighted average (OWA) operator and combine it with an adaptive scheme for\ngradient-based learning. Through extensive experimentation, our method\noutperforms other commonly used loss functions, such as the standard\ncross-entropy or focal loss, across various binary and multiclass\nclassification tasks. Furthermore, we explore the influence of hyperparameters\nassociated with the OWA operators and present a default configuration that\nperforms well across different experimental settings.\n","authors":["Sebastián Maldonado","Carla Vairetti","Katherine Jara","Miguel Carrasco","Julio López"],"pdf_url":"https://arxiv.org/pdf/2305.19443v2.pdf","comment":"15 pages, 1 figure, published"},{"id":"http://arxiv.org/abs/2310.08256v1","updated":"2023-10-12T12:01:32Z","published":"2023-10-12T12:01:32Z","title":"Impact of Co-occurrence on Factual Knowledge of Large Language Models","summary":" Large language models (LLMs) often make factually incorrect responses despite\ntheir success in various applications. In this paper, we hypothesize that\nrelying heavily on simple co-occurrence statistics of the pre-training corpora\nis one of the main factors that cause factual errors. Our results reveal that\nLLMs are vulnerable to the co-occurrence bias, defined as preferring frequently\nco-occurred words over the correct answer. Consequently, LLMs struggle to\nrecall facts whose subject and object rarely co-occur in the pre-training\ndataset although they are seen during finetuning. We show that co-occurrence\nbias remains despite scaling up model sizes or finetuning. Therefore, we\nsuggest finetuning on a debiased dataset to mitigate the bias by filtering out\nbiased samples whose subject-object co-occurrence count is high. Although\ndebiased finetuning allows LLMs to memorize rare facts in the training set, it\nis not effective in recalling rare facts unseen during finetuning. Further\nresearch in mitigation will help build reliable language models by preventing\npotential errors. 
The code is available at\n\\url{https://github.com/CheongWoong/impact_of_cooccurrence}.\n","authors":["Cheongwoong Kang","Jaesik Choi"],"pdf_url":"https://arxiv.org/pdf/2310.08256v1.pdf","comment":"EMNLP 2023 Findings"},{"id":"http://arxiv.org/abs/2304.12233v3","updated":"2023-10-12T12:00:09Z","published":"2023-04-20T10:45:57Z","title":"Diffusion-based Generative AI for Exploring Transition States from 2D\n Molecular Graphs","summary":" The exploration of transition state (TS) geometries is crucial for\nelucidating chemical reaction mechanisms and modeling their kinetics. Recently,\nmachine learning (ML) models have shown remarkable performance for prediction\nof TS geometries. However, they require 3D conformations of reactants and\nproducts often with their appropriate orientations as input, which demands\nsubstantial efforts and computational cost. Here, we propose a generative\napproach based on the stochastic diffusion method, namely TSDiff, for\nprediction of TS geometries just from 2D molecular graphs. TSDiff outperformed\nthe existing ML models with 3D geometries in terms of both accuracy and\nefficiency. Moreover, it enables to sample various TS conformations, because it\nlearned the distribution of TS geometries for diverse reactions in training.\nThus, TSDiff was able to find more favorable reaction pathways with lower\nbarrier heights than those in the reference database. These results demonstrate\nthat TSDiff shows promising potential for an efficient and reliable TS\nexploration.\n","authors":["Seonghwan Kim","Jeheon Woo","Woo Youn Kim"],"pdf_url":"https://arxiv.org/pdf/2304.12233v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.00327v2","updated":"2023-10-12T11:55:28Z","published":"2023-09-30T10:06:05Z","title":"Memorization with neural nets: going beyond the worst case","summary":" In practice, deep neural networks are often able to easily interpolate their\ntraining data. To understand this phenomenon, many works have aimed to quantify\nthe memorization capacity of a neural network architecture: the largest number\nof points such that the architecture can interpolate any placement of these\npoints with any assignment of labels. For real-world data, however, one\nintuitively expects the presence of a benign structure so that interpolation\nalready occurs at a smaller network size than suggested by memorization\ncapacity. In this paper, we investigate interpolation by adopting an\ninstance-specific viewpoint. We introduce a simple randomized algorithm that,\ngiven a fixed finite dataset with two classes, with high probability constructs\nan interpolating three-layer neural network in polynomial time. The required\nnumber of parameters is linked to geometric properties of the two classes and\ntheir mutual arrangement. As a result, we obtain guarantees that are\nindependent of the number of samples and hence move beyond worst-case\nmemorization capacity bounds. 
We illustrate the effectiveness of the algorithm\nin non-pathological situations with extensive numerical experiments and link\nthe insights back to the theoretical results.\n","authors":["Sjoerd Dirksen","Patrick Finke","Martin Genzel"],"pdf_url":"https://arxiv.org/pdf/2310.00327v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08252v1","updated":"2023-10-12T11:55:17Z","published":"2023-10-12T11:55:17Z","title":"MetaBox: A Benchmark Platform for Meta-Black-Box Optimization with\n Reinforcement Learning","summary":" Recently, Meta-Black-Box Optimization with Reinforcement Learning\n(MetaBBO-RL) has showcased the power of leveraging RL at the meta-level to\nmitigate manual fine-tuning of low-level black-box optimizers. However, this\nfield is hindered by the lack of a unified benchmark. To fill this gap, we\nintroduce MetaBox, the first benchmark platform expressly tailored for\ndeveloping and evaluating MetaBBO-RL methods. MetaBox offers a flexible\nalgorithmic template that allows users to effortlessly implement their unique\ndesigns within the platform. Moreover, it provides a broad spectrum of over 300\nproblem instances, collected from synthetic to realistic scenarios, and an\nextensive library of 19 baseline methods, including both traditional black-box\noptimizers and recent MetaBBO-RL methods. Besides, MetaBox introduces three\nstandardized performance metrics, enabling a more thorough assessment of the\nmethods. In a bid to illustrate the utility of MetaBox for facilitating\nrigorous evaluation and in-depth analysis, we carry out a wide-ranging\nbenchmarking study on existing MetaBBO-RL methods. Our MetaBox is open-source\nand accessible at: https://github.com/GMC-DRL/MetaBox.\n","authors":["Zeyuan Ma","Hongshu Guo","Jiacheng Chen","Zhenrui Li","Guojun Peng","Yue-Jiao Gong","Yining Ma","Zhiguang Cao"],"pdf_url":"https://arxiv.org/pdf/2310.08252v1.pdf","comment":"Accepted at NeurIPS 2023"},{"id":"http://arxiv.org/abs/2110.03469v3","updated":"2023-10-12T11:53:59Z","published":"2021-10-07T13:49:23Z","title":"Federated Learning from Small Datasets","summary":" Federated learning allows multiple parties to collaboratively train a joint\nmodel without sharing local data. This enables applications of machine learning\nin settings of inherently distributed, undisclosable data such as in the\nmedical domain. In practice, joint training is usually achieved by aggregating\nlocal models, for which local training objectives have to be in expectation\nsimilar to the joint (global) objective. Often, however, local datasets are so\nsmall that local objectives differ greatly from the global objective, resulting\nin the failure of federated learning. We propose a novel approach that intertwines\nmodel aggregations with permutations of local models. The permutations expose\neach local model to a daisy chain of local datasets, resulting in more efficient\ntraining in data-sparse domains. 
This enables training on extremely small local\ndatasets, such as patient data across hospitals, while retaining the training\nefficiency and privacy benefits of federated learning.\n","authors":["Michael Kamp","Jonas Fischer","Jilles Vreeken"],"pdf_url":"https://arxiv.org/pdf/2110.03469v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04263v3","updated":"2023-10-12T11:42:59Z","published":"2023-08-08T13:59:56Z","title":"BarlowRL: Barlow Twins for Data-Efficient Reinforcement Learning","summary":" This paper introduces BarlowRL, a data-efficient reinforcement learning agent\nthat combines the Barlow Twins self-supervised learning framework with DER\n(Data-Efficient Rainbow) algorithm. BarlowRL outperforms both DER and its\ncontrastive counterpart CURL on the Atari 100k benchmark. BarlowRL avoids\ndimensional collapse by enforcing information spread to the whole space. This\nhelps RL algorithms to utilize uniformly spread state representation that\neventually results in a remarkable performance. The integration of Barlow Twins\nwith DER enhances data efficiency and achieves superior performance in the RL\ntasks. BarlowRL demonstrates the potential of incorporating self-supervised\nlearning techniques to improve RL algorithms.\n","authors":["Omer Veysel Cagatan","Baris Akgun"],"pdf_url":"https://arxiv.org/pdf/2308.04263v3.pdf","comment":"ACML 2023, Camera-Ready Version"},{"id":"http://arxiv.org/abs/2304.00457v3","updated":"2023-10-12T11:35:35Z","published":"2023-04-02T05:47:09Z","title":"LLMMaps -- A Visual Metaphor for Stratified Evaluation of Large Language\n Models","summary":" Large Language Models (LLMs) have revolutionized natural language processing\nand demonstrated impressive capabilities in various tasks. Unfortunately, they\nare prone to hallucinations, where the model exposes incorrect or false\ninformation in its responses, which renders diligent evaluation approaches\nmandatory. While LLM performance in specific knowledge fields is often\nevaluated based on question and answer (Q&A) datasets, such evaluations usually\nreport only a single accuracy number for the dataset, which often covers an\nentire field. This field-based evaluation, is problematic with respect to\ntransparency and model improvement. A stratified evaluation could instead\nreveal subfields, where hallucinations are more likely to occur and thus help\nto better assess LLMs' risks and guide their further development. To support\nsuch stratified evaluations, we propose LLMMaps as a novel visualization\ntechnique that enables users to evaluate LLMs' performance with respect to Q&A\ndatasets. LLMMaps provide detailed insights into LLMs' knowledge capabilities\nin different subfields, by transforming Q&A datasets as well as LLM responses\ninto an internal knowledge structure. An extension for comparative\nvisualization furthermore, allows for the detailed comparison of multiple LLMs.\nTo assess LLMMaps we use them to conduct a comparative analysis of several\nstate-of-the-art LLMs, such as BLOOM, GPT-2, GPT-3, ChatGPT and LLaMa-13B, as\nwell as two qualitative user evaluations. 
All necessary source code and data\nfor generating LLMMaps, to be used in scientific publications and elsewhere, are\navailable on GitHub: https://github.com/viscom-ulm/LLMMaps\n","authors":["Patrik Puchert","Poonam Poonam","Christian van Onzenoodt","Timo Ropinski"],"pdf_url":"https://arxiv.org/pdf/2304.00457v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08237v1","updated":"2023-10-12T11:33:15Z","published":"2023-10-12T11:33:15Z","title":"Towards a Unified Analysis of Kernel-based Methods Under Covariate Shift","summary":" Covariate shift occurs frequently in practice, where the input distributions\nof the source and target data are substantially different. Despite its\npractical importance in various learning problems, most existing methods\nfocus only on specific learning tasks and are not well validated\ntheoretically or numerically. To tackle this problem, we propose a unified\nanalysis of general nonparametric methods in a reproducing kernel Hilbert space\n(RKHS) under covariate shift. Our theoretical results are established for a\ngeneral loss belonging to a rich loss function family, which includes many\ncommonly used methods as special cases, such as mean regression, quantile\nregression, likelihood-based classification, and margin-based classification.\nTwo types of covariate shift problems are the focus of this paper, and sharp\nconvergence rates are established for a general loss function to provide a\nunified theoretical analysis, which concurs with the optimal results in the\nliterature where the squared loss is used. Extensive numerical studies on\nsynthetic and real examples confirm our theoretical findings and further\nillustrate the effectiveness of our proposed method.\n","authors":["Xingdong Feng","Xin He","Caixing Wang","Chao Wang","Jingnan Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.08237v1.pdf","comment":"Poster to appear in Thirty-seventh Conference on Neural Information\n Processing Systems"},{"id":"http://arxiv.org/abs/2310.08235v1","updated":"2023-10-12T11:31:01Z","published":"2023-10-12T11:31:01Z","title":"GROOT: Learning to Follow Instructions by Watching Gameplay Videos","summary":" We study the problem of building a controller that can follow open-ended\ninstructions in open-world environments. We propose to follow reference videos\nas instructions, which offer expressive goal specifications while eliminating\nthe need for expensive text-gameplay annotations. A new learning framework is\nderived to allow learning such instruction-following controllers from gameplay\nvideos while producing a video instruction encoder that induces a structured\ngoal space. We implement our agent GROOT in a simple yet effective\nencoder-decoder architecture based on causal transformers. We evaluate GROOT\nagainst open-world counterparts and human players on a proposed Minecraft\nSkillForge benchmark. The Elo ratings clearly show that GROOT is closing the\nhuman-machine gap as well as exhibiting a 70% winning rate over the best\ngeneralist agent baseline. Qualitative analysis of the induced goal space\nfurther demonstrates some interesting emergent properties, including goal\ncomposition and complex gameplay behavior synthesis. 
Code and video can be\nfound on the website https://craftjarvis-groot.github.io.\n","authors":["Shaofei Cai","Bowei Zhang","Zihao Wang","Xiaojian Ma","Anji Liu","Yitao Liang"],"pdf_url":"https://arxiv.org/pdf/2310.08235v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2107.14432v4","updated":"2023-10-12T11:25:26Z","published":"2021-07-30T05:33:43Z","title":"Adaptive Optimizers with Sparse Group Lasso for Neural Networks in CTR\n Prediction","summary":" We develop a novel framework that adds the regularizers of the sparse group\nlasso to a family of adaptive optimizers in deep learning, such as Momentum,\nAdagrad, Adam, AMSGrad, AdaHessian, and create a new class of optimizers, which\nare named Group Momentum, Group Adagrad, Group Adam, Group AMSGrad and Group\nAdaHessian, etc., accordingly. We establish theoretically proven convergence\nguarantees in the stochastic convex settings, based on primal-dual methods. We\nevaluate the regularized effect of our new optimizers on three large-scale\nreal-world ad click datasets with state-of-the-art deep learning models. The\nexperimental results reveal that compared with the original optimizers with the\npost-processing procedure which uses the magnitude pruning method, the\nperformance of the models can be significantly improved on the same sparsity\nlevel. Furthermore, in comparison to the cases without magnitude pruning, our\nmethods can achieve extremely high sparsity with significantly better or highly\ncompetitive performance. The code is available at\nhttps://github.com/intelligent-machine-learning/dlrover/blob/master/tfplus.\n","authors":["Yun Yue","Yongchao Liu","Suo Tong","Minghao Li","Zhen Zhang","Chunyang Wen","Huanjun Bao","Lihong Gu","Jinjie Gu","Yixiang Mu"],"pdf_url":"https://arxiv.org/pdf/2107.14432v4.pdf","comment":"24 pages. Published as a conference paper at ECML PKDD 2021. This\n version includes Appendix which was not included in the published version\n because of page limit"},{"id":"http://arxiv.org/abs/2310.08224v1","updated":"2023-10-12T11:16:57Z","published":"2023-10-12T11:16:57Z","title":"Emergence of Latent Binary Encoding in Deep Neural Network Classifiers","summary":" We observe the emergence of binary encoding within the latent space of\ndeep-neural-network classifiers. Such binary encoding is induced by introducing\na linear penultimate layer, which is equipped during training with a loss\nfunction that grows as $\\exp(\\vec{x}^2)$, where $\\vec{x}$ are the coordinates\nin the latent space. The phenomenon we describe represents a specific instance\nof a well-documented occurrence known as \\textit{neural collapse}, which arises\nin the terminal phase of training and entails the collapse of latent class\nmeans to the vertices of a simplex equiangular tight frame (ETF). We show that\nbinary encoding accelerates convergence toward the simplex ETF and enhances\nclassification accuracy.\n","authors":["Luigi Sbailò","Luca Ghiringhelli"],"pdf_url":"https://arxiv.org/pdf/2310.08224v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08221v1","updated":"2023-10-12T11:11:54Z","published":"2023-10-12T11:11:54Z","title":"SimCKP: Simple Contrastive Learning of Keyphrase Representations","summary":" Keyphrase generation (KG) aims to generate a set of summarizing words or\nphrases given a source document, while keyphrase extraction (KE) aims to\nidentify them from the text. 
Because the search space is much smaller in KE, it\nis often combined with KG to predict keyphrases that may or may not exist in\nthe corresponding document. However, current unified approaches adopt sequence\nlabeling and maximization-based generation that primarily operate at a token\nlevel, falling short in observing and scoring keyphrases as a whole. In this\nwork, we propose SimCKP, a simple contrastive learning framework that consists\nof two stages: 1) An extractor-generator that extracts keyphrases by learning\ncontext-aware phrase-level representations in a contrastive manner while also\ngenerating keyphrases that do not appear in the document; 2) A reranker that\nadapts scores for each generated phrase by likewise aligning their\nrepresentations with the corresponding document. Experimental results on\nmultiple benchmark datasets demonstrate the effectiveness of our proposed\napproach, which outperforms the state-of-the-art models by a significant\nmargin.\n","authors":["Minseok Choi","Chaeheon Gwak","Seho Kim","Si Hyeong Kim","Jaegul Choo"],"pdf_url":"https://arxiv.org/pdf/2310.08221v1.pdf","comment":"Accepted to Findings of EMNLP 2023"},{"id":"http://arxiv.org/abs/2309.00848v2","updated":"2023-10-12T11:11:23Z","published":"2023-09-02T07:17:43Z","title":"Bengali Document Layout Analysis -- A YOLOV8 Based Ensembling Approach","summary":" This paper focuses on enhancing Bengali Document Layout Analysis (DLA) using\nthe YOLOv8 model and innovative post-processing techniques. We tackle\nchallenges unique to the complex Bengali script by employing data augmentation\nfor model robustness. After meticulous validation set evaluation, we fine-tune\nour approach on the complete dataset, leading to a two-stage prediction\nstrategy for accurate element segmentation. Our ensemble model, combined with\npost-processing, outperforms individual base architectures, addressing issues\nidentified in the BaDLAD dataset. By leveraging this approach, we aim to\nadvance Bengali document analysis, contributing to improved OCR and document\ncomprehension and BaDLAD serves as a foundational resource for this endeavor,\naiding future research in the field. Furthermore, our experiments provided key\ninsights to incorporate new strategies into the established solution.\n","authors":["Nazmus Sakib Ahmed","Saad Sakib Noor","Ashraful Islam Shanto Sikder","Abhijit Paul"],"pdf_url":"https://arxiv.org/pdf/2309.00848v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08217v1","updated":"2023-10-12T11:05:34Z","published":"2023-10-12T11:05:34Z","title":"TriRE: A Multi-Mechanism Learning Paradigm for Continual Knowledge\n Retention and Promotion","summary":" Continual learning (CL) has remained a persistent challenge for deep neural\nnetworks due to catastrophic forgetting (CF) of previously learned tasks.\nSeveral techniques such as weight regularization, experience rehearsal, and\nparameter isolation have been proposed to alleviate CF. Despite their relative\nsuccess, these research directions have predominantly remained orthogonal and\nsuffer from several shortcomings, while missing out on the advantages of\ncompeting strategies. On the contrary, the brain continually learns,\naccommodates, and transfers knowledge across tasks by simultaneously leveraging\nseveral neurophysiological processes, including neurogenesis, active\nforgetting, neuromodulation, metaplasticity, experience rehearsal, and\ncontext-dependent gating, rarely resulting in CF. 
Inspired by how the brain\nexploits multiple mechanisms concurrently, we propose TriRE, a novel CL\nparadigm that encompasses retaining the most prominent neurons for each task,\nrevising and solidifying the extracted knowledge of current and past tasks, and\nactively promoting less active neurons for subsequent tasks through rewinding\nand relearning. Across CL settings, TriRE significantly reduces task\ninterference and surpasses different CL approaches considered in isolation.\n","authors":["Preetha Vijayan","Prashant Bhat","Elahe Arani","Bahram Zonooz"],"pdf_url":"https://arxiv.org/pdf/2310.08217v1.pdf","comment":"Accepted at 37th Conference on Neural Information Processing Systems\n (NeurIPS 2023)"},{"id":"http://arxiv.org/abs/2310.08215v1","updated":"2023-10-12T11:04:17Z","published":"2023-10-12T11:04:17Z","title":"Trustworthy Machine Learning","summary":" As machine learning technology gets applied to actual products and solutions,\nnew challenges have emerged. Models unexpectedly fail to generalize to small\nchanges in the distribution, tend to be confident on novel data they have never\nseen, or cannot communicate the rationale behind their decisions effectively\nwith the end users. Collectively, we face a trustworthiness issue with the\ncurrent machine learning technology. This textbook on Trustworthy Machine\nLearning (TML) covers a theoretical and technical background of four key topics\nin TML: Out-of-Distribution Generalization, Explainability, Uncertainty\nQuantification, and Evaluation of Trustworthiness. We discuss important\nclassical and contemporary research papers of the aforementioned fields and\nuncover and connect their underlying intuitions. The book evolved from the\nhomonymous course at the University of T\\\"ubingen, first offered in the Winter\nSemester of 2022/23. It is meant to be a stand-alone product accompanied by\ncode snippets and various pointers to further sources on topics of TML. The\ndedicated website of the book is https://trustworthyml.io/.\n","authors":["Bálint Mucsányi","Michael Kirchhof","Elisa Nguyen","Alexander Rubinstein","Seong Joon Oh"],"pdf_url":"https://arxiv.org/pdf/2310.08215v1.pdf","comment":"373 pages, textbook at the University of T\\\"ubingen"},{"id":"http://arxiv.org/abs/2310.08209v1","updated":"2023-10-12T10:56:25Z","published":"2023-10-12T10:56:25Z","title":"Conformal inference for regression on Riemannian Manifolds","summary":" Regression on manifolds, and, more broadly, statistics on manifolds, has\ngarnered significant importance in recent years due to the vast number of\napplications for this type of data. Circular data is a classic example, but so\nis data in the space of covariance matrices, data on the Grassmannian manifold\nobtained as a result of principal component analysis, among many others. In\nthis work we investigate prediction sets for regression scenarios when the\nresponse variable, denoted by $Y$, resides in a manifold, and the covariable,\ndenoted by X, lies in Euclidean space. This extends the concepts delineated in\n[Lei and Wasserman, 2014] to this novel context. Aligning with traditional\nprinciples in conformal inference, these prediction sets are distribution-free,\nindicating that no specific assumptions are imposed on the joint distribution\nof $(X, Y)$, and they maintain a non-parametric character. We prove the\nasymptotic almost sure convergence of the empirical version of these regions on\nthe manifold to their population counterparts. 
The efficiency of this method is\nshown through a comprehensive simulation study and an analysis involving\nreal-world data.\n","authors":["Alejandro Cholaquidis","Fabrice Gamboa","Leonardo Moreno"],"pdf_url":"https://arxiv.org/pdf/2310.08209v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08204v1","updated":"2023-10-12T10:50:21Z","published":"2023-10-12T10:50:21Z","title":"Lifelong Audio-video Masked Autoencoder with Forget-robust Localized\n Alignments","summary":" We present a lifelong audio-video masked autoencoder that continually learns\nthe multimodal representations from a video stream containing audio-video\npairs, while its distribution continually shifts over time. Specifically, we\npropose two novel ideas to tackle the problem: (1) Localized Alignment: We\nintroduce a small trainable multimodal encoder that predicts the audio and\nvideo tokens that are well-aligned with each other. This allows the model to\nlearn only the highly correlated audiovisual patches with accurate multimodal\nrelationships. (2) Forget-robust multimodal patch selection: We compare the\nrelative importance of each audio-video patch between the current and past data\npair to mitigate unintended drift of the previously learned audio-video\nrepresentations. Our proposed method, FLAVA (Forget-robust Localized\nAudio-Video Alignment), therefore, captures the complex relationships between\nthe audio and video modalities during training on a sequence of pre-training\ntasks while alleviating the forgetting of learned audiovisual correlations. Our\nexperiments validate that FLAVA outperforms the state-of-the-art continual\nlearning methods on several benchmark datasets under continual audio-video\nrepresentation learning scenarios.\n","authors":["Jaewoo Lee","Jaehong Yoon","Wonjae Kim","Yunji Kim","Sung Ju Hwang"],"pdf_url":"https://arxiv.org/pdf/2310.08204v1.pdf","comment":"Preprint, project page: https://g-jwlee.github.io/FLAVA/"},{"id":"http://arxiv.org/abs/2310.08198v1","updated":"2023-10-12T10:44:47Z","published":"2023-10-12T10:44:47Z","title":"Beyond Traditional DoE: Deep Reinforcement Learning for Optimizing\n Experiments in Model Identification of Battery Dynamics","summary":" Model identification of battery dynamics is a central problem in energy\nresearch; many energy management systems and design processes rely on accurate\nbattery models for efficiency optimization. The standard methodology for\nbattery modelling is traditional design of experiments (DoE), where the battery\ndynamics are excited with many different current profiles and the measured\noutputs are used to estimate the system dynamics. However, although it is\npossible to obtain useful models with the traditional approach, the process is\ntime consuming and expensive because of the need to sweep many different\ncurrent-profile configurations. In the present work, a novel DoE approach is\ndeveloped based on deep reinforcement learning, which alters the configuration\nof the experiments on the fly based on the statistics of past experiments.\nInstead of sticking to a library of predefined current profiles, the proposed\napproach modifies the current profiles dynamically by updating the output space\ncovered by past measurements, hence only the current profiles that are\ninformative for future experiments are applied. 
Simulations and real\nexperiments are used to show that the proposed approach gives models that are\nas accurate as those obtained with traditional DoE but by using 85\\% less\nresources.\n","authors":["Gokhan Budan","Francesca Damiani","Can Kurtulus","N. Kemal Ure"],"pdf_url":"https://arxiv.org/pdf/2310.08198v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.09033v2","updated":"2023-10-12T10:34:03Z","published":"2023-03-16T02:07:29Z","title":"Only Pay for What Is Uncertain: Variance-Adaptive Thompson Sampling","summary":" Most bandit algorithms assume that the reward variances or their upper bounds\nare known, and that they are the same for all arms. This naturally leads to\nsuboptimal performance and higher regret due to variance overestimation. On the\nother hand, underestimated reward variances may lead to linear regret due to\ncommitting early to a suboptimal arm. This motivated prior works on\nvariance-adaptive frequentist algorithms, which have strong instance-dependent\nregret bounds but cannot incorporate prior knowledge on reward variances. We\nlay foundations for the Bayesian setting, which incorporates prior knowledge.\nThis results in lower regret in practice, due to using the prior in the\nalgorithm design, and also improved regret guarantees. Specifically, we study\nGaussian bandits with {unknown heterogeneous reward variances}, and develop a\nThompson sampling algorithm with prior-dependent Bayes regret bounds. We\nachieve lower regret with lower reward variances and more informative priors on\nthem, which is precisely why we pay only for what is uncertain. This is the\nfirst result of its kind. Finally, we corroborate our theory with extensive\nexperiments, which show the superiority of our variance-adaptive Bayesian\nalgorithm over prior frequentist approaches. We also show that our approach is\nrobust to model misspecification and can be applied with estimated priors.\n","authors":["Aadirupa Saha","Branislav Kveton"],"pdf_url":"https://arxiv.org/pdf/2303.09033v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07402v2","updated":"2023-10-12T10:30:35Z","published":"2023-10-11T11:38:18Z","title":"NuTime: Numerically Multi-Scaled Embedding for Large-Scale Time Series\n Pretraining","summary":" Recent research on time-series self-supervised models shows great promise in\nlearning semantic representations. However, it has been limited to small-scale\ndatasets, e.g., thousands of temporal sequences. In this work, we make key\ntechnical contributions that are tailored to the numerical properties of\ntime-series data and allow the model to scale to large datasets, e.g., millions\nof temporal sequences. We adopt the Transformer architecture by first\npartitioning the input into non-overlapping windows. Each window is then\ncharacterized by its normalized shape and two scalar values denoting the mean\nand standard deviation within each window. To embed scalar values that may\npossess arbitrary numerical scales to high-dimensional vectors, we propose a\nnumerically multi-scaled embedding module enumerating all possible scales for\nthe scalar values. The model undergoes pretraining using the proposed\nnumerically multi-scaled embedding with a simple contrastive objective on a\nlarge-scale dataset containing over a million sequences. We study its transfer\nperformance on a number of univariate and multivariate classification\nbenchmarks. 
Our method exhibits remarkable improvement against previous\nrepresentation learning approaches and establishes the new state of the art,\neven compared with domain-specific non-learning-based methods.\n","authors":["Chenguo Lin","Xumeng Wen","Wei Cao","Congrui Huang","Jiang Bian","Stephen Lin","Zhirong Wu"],"pdf_url":"https://arxiv.org/pdf/2310.07402v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08184v1","updated":"2023-10-12T10:20:36Z","published":"2023-10-12T10:20:36Z","title":"Learn From Model Beyond Fine-Tuning: A Survey","summary":" Foundation models (FM) have demonstrated remarkable performance across a wide\nrange of tasks (especially in the fields of natural language processing and\ncomputer vision), primarily attributed to their ability to comprehend\ninstructions and access extensive, high-quality data. This not only showcases\ntheir current effectiveness but also sets a promising trajectory towards the\ndevelopment of artificial general intelligence. Unfortunately, due to multiple\nconstraints, the raw data of the model used for large model training are often\ninaccessible, so the use of end-to-end models for downstream tasks has become a\nnew research trend, which we call Learn From Model (LFM) in this article. LFM\nfocuses on the research, modification, and design of FM based on the model\ninterface, so as to better understand the model structure and weights (in a\nblack box environment), and to generalize the model to downstream tasks. The\nstudy of LFM techniques can be broadly categorized into five major areas: model\ntuning, model distillation, model reuse, meta learning and model editing. Each\ncategory encompasses a repertoire of methods and strategies that aim to enhance\nthe capabilities and performance of FM. This paper gives a comprehensive review\nof the current methods based on FM from the perspective of LFM, in order to\nhelp readers better understand the current research status and ideas. To\nconclude, we summarize the survey by highlighting several critical areas for\nfuture exploration and addressing open issues that require further attention\nfrom the research community. The relevant papers we investigated in this\narticle can be accessed at\n.\n","authors":["Hongling Zheng","Li Shen","Anke Tang","Yong Luo","Han Hu","Bo Du","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2310.08184v1.pdf","comment":"20 pages, 9 figures"},{"id":"http://arxiv.org/abs/2310.08182v1","updated":"2023-10-12T10:17:40Z","published":"2023-10-12T10:17:40Z","title":"XIMAGENET-12: An Explainable AI Benchmark Dataset for Model Robustness\n Evaluation","summary":" The lack of standardized robustness metrics and the widespread reliance on\nnumerous unrelated benchmark datasets for testing have created a gap between\nacademically validated robust models and their often problematic practical\nadoption. To address this, we introduce XIMAGENET-12, an explainable benchmark\ndataset with over 200K images and 15,600 manual semantic annotations. Covering\n12 categories from ImageNet to represent objects commonly encountered in\npractical life and simulating six diverse scenarios, including overexposure,\nblurring, color changing, etc., we further propose a novel robustness criterion\nthat extends beyond model generation ability assessment. This benchmark\ndataset, along with related code, is available at\nhttps://sites.google.com/view/ximagenet-12/home. 
Researchers and practitioners\ncan leverage this resource to evaluate the robustness of their visual models\nunder challenging conditions and ultimately better meet the demands of\npractical computer vision systems.\n","authors":["Qiang Li","Dan Zhang","Shengzhao Lei","Xun Zhao","Shuyan Li","Porawit Kamnoedboon","WeiWei Li"],"pdf_url":"https://arxiv.org/pdf/2310.08182v1.pdf","comment":"UnderSubmission"},{"id":"http://arxiv.org/abs/2310.08177v1","updated":"2023-10-12T10:03:25Z","published":"2023-10-12T10:03:25Z","title":"Improving Fast Minimum-Norm Attacks with Hyperparameter Optimization","summary":" Evaluating the adversarial robustness of machine learning models using\ngradient-based attacks is challenging. In this work, we show that\nhyperparameter optimization can improve fast minimum-norm attacks by automating\nthe selection of the loss function, the optimizer and the step-size scheduler,\nalong with the corresponding hyperparameters. Our extensive evaluation\ninvolving several robust models demonstrates the improved efficacy of fast\nminimum-norm attacks when coupled with hyperparameter optimization. We release\nour open-source code at https://github.com/pralab/HO-FMN.\n","authors":["Giuseppe Floris","Raffaele Mura","Luca Scionis","Giorgio Piras","Maura Pintor","Ambra Demontis","Battista Biggio"],"pdf_url":"https://arxiv.org/pdf/2310.08177v1.pdf","comment":"Accepted at ESANN23"},{"id":"http://arxiv.org/abs/2310.08176v1","updated":"2023-10-12T10:01:39Z","published":"2023-10-12T10:01:39Z","title":"Infinite Width Graph Neural Networks for Node Regression/ Classification","summary":" This work analyzes Graph Neural Networks, a generalization of Fully-Connected\nDeep Neural Nets on Graph structured data, when their width, that is, the number\nof nodes in each fully-connected layer, increases to infinity. Infinite Width\nNeural Networks connect Deep Learning to Gaussian Processes and Kernels,\nboth Machine Learning Frameworks with long traditions and extensive theoretical\nfoundations. Gaussian Processes and Kernels have far fewer hyperparameters than\nNeural Networks and can be used for uncertainty estimation, making them more\nuser-friendly for applications. This work extends the increasing amount of\nresearch connecting Gaussian Processes and Kernels to Neural Networks. The\nKernel and Gaussian Process closed forms are derived for a variety of\narchitectures, namely the standard Graph Neural Network, the Graph Neural\nNetwork with Skip-Concatenate Connections and the Graph Attention Neural\nNetwork. All architectures are evaluated on a variety of datasets on the task\nof transductive Node Regression and Classification. Additionally, a Spectral\nSparsification method known as Effective Resistance is used to improve runtime\nand memory requirements. Extending the setting to inductive graph learning\ntasks (Graph Regression/ Classification) is straightforward and is briefly\ndiscussed in Section 3.5.\n","authors":["Yunus Cobanoglu"],"pdf_url":"https://arxiv.org/pdf/2310.08176v1.pdf","comment":"50 Pages, 2 Figures (with subfigures), multiple tables"},{"id":"http://arxiv.org/abs/2308.08469v3","updated":"2023-10-12T09:58:03Z","published":"2023-08-16T16:19:50Z","title":"LLM4TS: Two-Stage Fine-Tuning for Time-Series Forecasting with\n Pre-Trained LLMs","summary":" In this work, we leverage pre-trained Large Language Models (LLMs) to enhance\ntime-series forecasting. 
Mirroring the growing interest in unifying models for\nNatural Language Processing and Computer Vision, we envision creating an\nanalogous model for long-term time-series forecasting. Due to limited\nlarge-scale time-series data for building robust foundation models, our\napproach LLM4TS focuses on leveraging the strengths of pre-trained LLMs. By\ncombining time-series patching with temporal encoding, we have enhanced the\ncapability of LLMs to handle time-series data effectively. Inspired by the\nsupervised fine-tuning in chatbot domains, we prioritize a two-stage\nfine-tuning process: first conducting supervised fine-tuning to orient the LLM\ntowards time-series data, followed by task-specific downstream fine-tuning.\nFurthermore, to unlock the flexibility of pre-trained LLMs without extensive\nparameter adjustments, we adopt several Parameter-Efficient Fine-Tuning (PEFT)\ntechniques. Drawing on these innovations, LLM4TS has yielded state-of-the-art\nresults in long-term forecasting. Our model has also shown exceptional\ncapabilities as both a robust representation learner and an effective few-shot\nlearner, thanks to the knowledge transferred from the pre-trained LLM.\n","authors":["Ching Chang","Wen-Chih Peng","Tien-Fu Chen"],"pdf_url":"https://arxiv.org/pdf/2308.08469v3.pdf","comment":"This paper is currently under review. The code will be made available\n upon acceptance"},{"id":"http://arxiv.org/abs/2310.08165v1","updated":"2023-10-12T09:37:56Z","published":"2023-10-12T09:37:56Z","title":"COVID-19 Detection Using Swin Transformer Approach from Computed\n Tomography Images","summary":" The accurate and efficient diagnosis of COVID-19 is of paramount importance,\nparticularly in the context of large-scale medical imaging datasets. In this\npreprint paper, we propose a novel approach for COVID-19 diagnosis using CT\nimages that leverages the power of Swin Transformer models, state-of-the-art\nsolutions in computer vision tasks. Our method includes a systematic approach\nfor patient-level predictions, where individual CT slices are classified as\nCOVID-19 or non-COVID, and the patient's overall diagnosis is determined\nthrough majority voting. The application of the Swin Transformer in this\ncontext results in patient-level predictions that demonstrate exceptional\ndiagnostic accuracy. In terms of evaluation metrics, our approach consistently\noutperforms the baseline, as well as numerous competing methods, showcasing its\neffectiveness in COVID-19 diagnosis. The macro F1 score achieved by our model\nexceeds the baseline and offers a robust solution for accurate diagnosis.\n","authors":["Kenan Morani"],"pdf_url":"https://arxiv.org/pdf/2310.08165v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08164v1","updated":"2023-10-12T09:36:03Z","published":"2023-10-12T09:36:03Z","title":"Interpreting Reward Models in RLHF-Tuned Language Models Using Sparse\n Autoencoders","summary":" Large language models (LLMs) aligned to human preferences via reinforcement\nlearning from human feedback (RLHF) underpin many commercial applications.\nHowever, how RLHF impacts LLM internals remains opaque. We propose a novel\nmethod to interpret learned reward functions in RLHF-tuned LLMs using sparse\nautoencoders. Our approach trains autoencoder sets on activations from a base\nLLM and its RLHF-tuned version. 
By comparing autoencoder hidden spaces, we\nidentify unique features that reflect the accuracy of the learned reward model.\nTo quantify this, we construct a scenario where the tuned LLM learns\ntoken-reward mappings to maximize reward. This is the first application of\nsparse autoencoders for interpreting learned rewards and broadly inspecting\nreward learning in LLMs. Our method provides an abstract approximation of\nreward integrity. This presents a promising technique for ensuring alignment\nbetween specified objectives and model behaviors.\n","authors":["Luke Marks","Amir Abdullah","Luna Mendez","Rauno Arike","Philip Torr","Fazl Barez"],"pdf_url":"https://arxiv.org/pdf/2310.08164v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.04234v2","updated":"2023-10-12T09:20:00Z","published":"2023-04-09T13:20:19Z","title":"Variational operator learning: A unified paradigm marrying training\n neural operators and solving partial differential equations","summary":" Neural operators as novel neural architectures for fast approximating\nsolution operators of partial differential equations (PDEs), have shown\nconsiderable promise for future scientific computing. However, the mainstream\nof training neural operators is still data-driven, which needs an expensive\nground-truth dataset from various sources (e.g., solving PDEs' samples with the\nconventional solvers, real-world experiments) in addition to training stage\ncosts. From a computational perspective, marrying operator learning and\nspecific domain knowledge to solve PDEs is an essential step in reducing\ndataset costs and label-free learning. We propose a novel paradigm that\nprovides a unified framework of training neural operators and solving PDEs with\nthe variational form, which we refer to as the variational operator learning\n(VOL). Ritz and Galerkin approach with finite element discretization are\ndeveloped for VOL to achieve matrix-free approximation of system functional and\nresidual, then direct minimization and iterative update are proposed as two\noptimization strategies for VOL. Various types of experiments based on\nreasonable benchmarks about variable heat source, Darcy flow, and variable\nstiffness elasticity are conducted to demonstrate the effectiveness of VOL.\nWith a label-free training set and a 5-label-only shift set, VOL learns\nsolution operators with its test errors decreasing in a power law with respect\nto the amount of unlabeled data. To the best of the authors' knowledge, this is\nthe first study that integrates the perspectives of the weak form and efficient\niterative methods for solving sparse linear systems into the end-to-end\noperator learning task.\n","authors":["Tengfei Xu","Dachuan Liu","Peng Hao","Bo Wang"],"pdf_url":"https://arxiv.org/pdf/2304.04234v2.pdf","comment":"35 pages, 6 figures with 5 extended figures"},{"id":"http://arxiv.org/abs/2305.14133v2","updated":"2023-10-12T09:18:09Z","published":"2023-05-23T14:56:19Z","title":"Conditional Mutual Information for Disentangled Representations in\n Reinforcement Learning","summary":" Reinforcement Learning (RL) environments can produce training data with\nspurious correlations between features due to the amount of training data or\nits limited feature coverage. This can lead to RL agents encoding these\nmisleading correlations in their latent representation, preventing the agent\nfrom generalising if the correlation changes within the environment or when\ndeployed in the real world. 
Disentangled representations can improve\nrobustness, but existing disentanglement techniques that minimise mutual\ninformation between features require independent features, and thus cannot\ndisentangle correlated features. We propose an auxiliary task for RL algorithms\nthat learns a disentangled representation of high-dimensional observations with\ncorrelated features by minimising the conditional mutual information between\nfeatures in the representation. We demonstrate experimentally, using continuous\ncontrol tasks, that our approach improves generalisation under correlation\nshifts, as well as improving the training performance of RL algorithms in the\npresence of correlated features.\n","authors":["Mhairi Dunion","Trevor McInroe","Kevin Sebastian Luck","Josiah P. Hanna","Stefano V. Albrecht"],"pdf_url":"https://arxiv.org/pdf/2305.14133v2.pdf","comment":"Conference on Neural Information Processing Systems (NeurIPS), 2023"},{"id":"http://arxiv.org/abs/2310.08150v1","updated":"2023-10-12T09:17:46Z","published":"2023-10-12T09:17:46Z","title":"On Extreme Value Asymptotics of Projected Sample Covariances in High\n Dimensions with Applications in Finance and Convolutional Networks","summary":" Maximum-type statistics of certain functions of the sample covariance matrix\nof high-dimensional vector time series are studied to statistically confirm or\nreject the null hypothesis that a data set has been collected under normal\nconditions. The approach generalizes the case of the maximal deviation of the\nsample autocovariance function from its assumed values. Within a linear time\nseries framework it is shown that Gumbel-type extreme value asymptotics holds\ntrue. As applications we discuss long-only minimal-variance portfolio\noptimization and subportfolio analysis with respect to idiosyncratic risks, ETF\nindex tracking by sparse tracking portfolios, convolutional deep learners for\nimage analysis and the analysis of array-of-sensors data.\n","authors":["Ansgar Steland"],"pdf_url":"https://arxiv.org/pdf/2310.08150v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08148v1","updated":"2023-10-12T09:12:50Z","published":"2023-10-12T09:12:50Z","title":"Open-Set Knowledge-Based Visual Question Answering with Inference Paths","summary":" Given an image and an associated textual question, the purpose of\nKnowledge-Based Visual Question Answering (KB-VQA) is to provide a correct\nanswer to the question with the aid of external knowledge bases. Prior KB-VQA\nmodels are usually formulated as a retriever-classifier framework, where a\npre-trained retriever extracts textual or visual information from knowledge\ngraphs and then makes a prediction among the candidates. Despite promising\nprogress, there are two drawbacks to existing models. Firstly, modeling\nquestion-answering as multi-class classification limits the answer space to a\npreset corpus and lacks the capacity for flexible reasoning. Secondly, the\nclassifier merely considers \"what is the answer\" without \"how to get the\nanswer\", and thus cannot ground the answer in explicit reasoning paths. In this\npaper, we confront the challenge of \\emph{explainable open-set} KB-VQA, where\nthe system is required to answer questions with entities in the wild and retain an\nexplainable reasoning path. To resolve the aforementioned issues, we propose a\nnew retriever-ranker paradigm of KB-VQA, Graph pATH rankER (GATHER for\nbrevity). 
Specifically, it contains graph construction, pruning, and path-level\nranking, which not only retrieves accurate answers but also provides inference\npaths that explain the reasoning process. To comprehensively evaluate our\nmodel, we reformulate the benchmark dataset OK-VQA with manually corrected\nentity-level annotations and release it as ConceptVQA. Extensive experiments on\nreal-world questions demonstrate that our framework is not only able to perform\nopen-set question answering across the whole knowledge base but also provides\nexplicit reasoning paths.\n","authors":["Jingru Gan","Xinzhe Han","Shuhui Wang","Qingming Huang"],"pdf_url":"https://arxiv.org/pdf/2310.08148v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.16584v2","updated":"2023-10-12T09:03:48Z","published":"2023-09-28T16:44:18Z","title":"A Design Toolbox for the Development of Collaborative Distributed\n Machine Learning Systems","summary":" To leverage data for the sufficient training of machine learning (ML) models\nfrom multiple parties in a confidentiality-preserving way, various\ncollaborative distributed ML (CDML) system designs have been developed, for\nexample, to perform assisted learning, federated learning, and split learning.\nCDML system designs show different traits, including high agent autonomy, ML\nmodel confidentiality, and fault tolerance. Facing a wide variety of CDML\nsystem designs with different traits, it is difficult for developers to design\nCDML systems with traits that match use case requirements in a targeted way.\nHowever, inappropriate CDML system designs may result in CDML systems failing\ntheir envisioned purposes. We developed a CDML design toolbox that can guide\nthe development of CDML systems. Based on the CDML design toolbox, we present\nCDML system archetypes with distinct key traits that can support the design of\nCDML systems to meet use case requirements.\n","authors":["David Jin","Niclas Kannengießer","Sascha Rank","Ali Sunyaev"],"pdf_url":"https://arxiv.org/pdf/2309.16584v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08138v1","updated":"2023-10-12T08:52:36Z","published":"2023-10-12T08:52:36Z","title":"Multi-Scale Spatial-Temporal Recurrent Networks for Traffic Flow\n Prediction","summary":" Traffic flow prediction is one of the most fundamental tasks of intelligent\ntransportation systems. The complex and dynamic spatial-temporal dependencies\nmake traffic flow prediction quite challenging. Although existing\nspatial-temporal graph neural networks are prominent, they often encounter\nchallenges such as (1) relying on a fixed graph, which limits the predictive\nperformance of the model, (2) insufficiently capturing complex spatial-temporal\ndependencies simultaneously, and (3) lacking attention to spatial-temporal\ninformation at different time lengths. In this paper, we propose a Multi-Scale\nSpatial-Temporal Recurrent Network for traffic flow prediction, namely MSSTRN,\nwhich consists of two different recurrent neural networks: the single-step gated\nrecurrent unit and the multi-step gated recurrent unit, to fully capture the\ncomplex spatial-temporal information in the traffic data under different time\nwindows. Moreover, we propose a spatial-temporal synchronous attention\nmechanism that integrates adaptive position graph convolutions into the\nself-attention mechanism to achieve synchronous capture of spatial-temporal\ndependencies. 
We conducted extensive experiments on four real traffic datasets\nand demonstrated that our model achieves the best prediction accuracy with\nnon-trivial margins compared to all the twenty baseline methods.\n","authors":["Haiyang Liu","Chunjiang Zhu","Detian Zhang","Qing Li"],"pdf_url":"https://arxiv.org/pdf/2310.08138v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08137v1","updated":"2023-10-12T08:51:59Z","published":"2023-10-12T08:51:59Z","title":"Counterfactual Explanations for Time Series Forecasting","summary":" Among recent developments in time series forecasting methods, deep\nforecasting models have gained popularity as they can utilize hidden feature\npatterns in time series to improve forecasting performance. Nevertheless, the\nmajority of current deep forecasting models are opaque, hence making it\nchallenging to interpret the results. While counterfactual explanations have\nbeen extensively employed as a post-hoc approach for explaining classification\nmodels, their application to forecasting models still remains underexplored. In\nthis paper, we formulate the novel problem of counterfactual generation for\ntime series forecasting, and propose an algorithm, called ForecastCF, that\nsolves the problem by applying gradient-based perturbations to the original\ntime series. ForecastCF guides the perturbations by applying constraints to the\nforecasted values to obtain desired prediction outcomes. We experimentally\nevaluate ForecastCF using four state-of-the-art deep model architectures and\ncompare to two baselines. Our results show that ForecastCF outperforms the\nbaseline in terms of counterfactual validity and data manifold closeness.\nOverall, our findings suggest that ForecastCF can generate meaningful and\nrelevant counterfactual explanations for various forecasting tasks.\n","authors":["Zhendong Wang","Ioanna Miliou","Isak Samsten","Panagiotis Papapetrou"],"pdf_url":"https://arxiv.org/pdf/2310.08137v1.pdf","comment":"10 pages, 6 figures. Accepted by ICDM 2023"},{"id":"http://arxiv.org/abs/2310.00177v3","updated":"2023-10-12T08:51:28Z","published":"2023-09-29T22:49:47Z","title":"A Neural-preconditioned Poisson Solver for Mixed Dirichlet and Neumann\n Boundary Conditions","summary":" We introduce a neural-preconditioned iterative solver for Poisson equations\nwith mixed boundary conditions. The Poisson equation is ubiquitous in\nscientific computing: it governs a wide array of physical phenomena, arises as\na subproblem in many numerical algorithms, and serves as a model problem for\nthe broader class of elliptic PDEs. The most popular Poisson discretizations\nyield large sparse linear systems. At high resolution, and for\nperformance-critical applications, iterative solvers can be advantageous for\nthese -- but only when paired with powerful preconditioners. The core of our\nsolver is a neural network trained to approximate the inverse of a discrete\nstructured-grid Laplace operator for a domain of arbitrary shape and with mixed\nboundary conditions. The structure of this problem motivates a novel network\narchitecture that we demonstrate is highly effective as a preconditioner even\nfor boundary conditions outside the training set. 
We show that on challenging\ntest cases arising from an incompressible fluid simulation, our method\noutperforms state-of-the-art solvers like algebraic multigrid as well as some\nrecent neural preconditioners.\n","authors":["Kai Weixian Lan","Elias Gueidon","Ayano Kaneda","Julian Panetta","Joseph Teran"],"pdf_url":"https://arxiv.org/pdf/2310.00177v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.06118v2","updated":"2023-10-12T08:46:40Z","published":"2023-05-10T13:09:57Z","title":"NeRF2: Neural Radio-Frequency Radiance Fields","summary":" Although Maxwell discovered the physical laws of electromagnetic waves 160\nyears ago, how to precisely model the propagation of an RF signal in an\nelectrically large and complex environment remains a long-standing problem. The\ndifficulty is in the complex interactions between the RF signal and the\nobstacles (e.g., reflection, diffraction, etc.). Inspired by the great success\nof using a neural network to describe the optical field in computer vision, we\npropose a neural radio-frequency radiance field, NeRF$^\\textbf{2}$, which\nrepresents a continuous volumetric scene function that makes sense of an RF\nsignal's propagation. Particularly, after training with a few signal\nmeasurements, NeRF$^\\textbf{2}$ can tell how/what signal is received at any\nposition when it knows the position of a transmitter. As a physical-layer\nneural network, NeRF$^\\textbf{2}$ can take advantage of the learned statistic\nmodel plus the physical model of ray tracing to generate a synthetic dataset\nthat meets the training demands of application-layer artificial neural networks\n(ANNs). Thus, we can boost the performance of ANNs by the proposed\nturbo-learning, which mixes the true and synthetic datasets to intensify the\ntraining. Our experiment results show that turbo-learning can enhance\nperformance with an approximate 50% increase. We also demonstrate the power of\nNeRF$^\\textbf{2}$ in the field of indoor localization and 5G MIMO.\n","authors":["Xiaopeng Zhao","Zhenlin An","Qingrui Pan","Lei Yang"],"pdf_url":"https://arxiv.org/pdf/2305.06118v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.02286v3","updated":"2023-10-12T08:40:31Z","published":"2022-10-05T14:22:24Z","title":"Efficient probabilistic reconciliation of forecasts for real-valued and\n count time series","summary":" Hierarchical time series are common in several applied fields. The forecasts\nfor these time series are required to be coherent, that is, to satisfy the\nconstraints given by the hierarchy. The most popular technique to enforce\ncoherence is called reconciliation, which adjusts the base forecasts computed\nfor each time series. However, recent works on probabilistic reconciliation\npresent several limitations. In this paper, we propose a new approach based on\nconditioning to reconcile any type of forecast distribution. We then introduce\na new algorithm, called Bottom-Up Importance Sampling, to efficiently sample\nfrom the reconciled distribution. It can be used for any base forecast\ndistribution: discrete, continuous, or in the form of samples, providing a\nmajor speedup compared to the current methods. 
Experiments on several temporal\nhierarchies show a significant improvement over base probabilistic forecasts.\n","authors":["Lorenzo Zambon","Dario Azzimonti","Giorgio Corani"],"pdf_url":"https://arxiv.org/pdf/2210.02286v3.pdf","comment":"27 pages, 4 figures"},{"id":"http://arxiv.org/abs/2310.08122v1","updated":"2023-10-12T08:24:02Z","published":"2023-10-12T08:24:02Z","title":"Core-sets for Fair and Diverse Data Summarization","summary":" We study core-set construction algorithms for the task of Diversity\nMaximization under fairness/partition constraint. Given a set of points $P$ in\na metric space partitioned into $m$ groups, and given $k_1,\\ldots,k_m$, the\ngoal of this problem is to pick $k_i$ points from each group $i$ such that the\noverall diversity of the $k=\\sum_i k_i$ picked points is maximized. We consider\ntwo natural diversity measures: sum-of-pairwise distances and\nsum-of-nearest-neighbor distances, and show improved core-set construction\nalgorithms with respect to these measures. More precisely, we show the first\nconstant factor core-set w.r.t. sum-of-pairwise distances whose size is\nindependent of the size of the dataset and the aspect ratio. Second, we show\nthe first core-set w.r.t. the sum-of-nearest-neighbor distances. Finally, we\nrun several experiments showing the effectiveness of our core-set approach. In\nparticular, we apply constrained diversity maximization to summarize a set of\ntimed messages that takes into account the messages' recency. Specifically, the\nsummary should include more recent messages compared to older ones. This is a\nreal task in one of the largest communication platforms, affecting the\nexperience of hundreds of millions daily active users. By utilizing our\ncore-set method for this task, we achieve a 100x speed-up while losing the\ndiversity by only a few percent. Moreover, our approach allows us to improve\nthe space usage of the algorithm in the streaming setting.\n","authors":["Sepideh Mahabadi","Stojan Trajanovski"],"pdf_url":"https://arxiv.org/pdf/2310.08122v1.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2310.08109v1","updated":"2023-10-12T08:10:31Z","published":"2023-10-12T08:10:31Z","title":"Overview of Physics-Informed Machine Learning Inversion of Geophysical\n Data","summary":" We review four types of algorithms for physics-informed machine learning\n(PIML) inversion of geophysical data. The unifying equation is given by the\njoint objective function $\\epsilon$:\n \\begin{eqnarray} \\epsilon^{||-PIML}&=&\\lambda_1 \\overbrace{||{\\bf\nW}^{ML}({\\bf H}_{{\\bf w}} {\\bf d}^{obs}-{\\bf m})||^2}^{NN} + \\lambda_2\n\\overbrace{{||{\\bf W}^{FWI}({\\bf L} {\\bf m}-{\\bf d}^{obs})||^2}}^{FWI} ~+\n\\nonumber\\\\ \\nonumber\\\\ && + ~~Regularizer, \\label{PIML.eq120}\n\\end{eqnarray}where the optimal model ${\\bf m}^*$ and weights $\\bf w^*$\nminimize $\\epsilon$. Here, The matrix weights are given by the boldface symbol\n$\\bf W$, and full waveform inversion (FWI) is typically computed using a\nfinite-difference solution of the wave equation, where $\\bf L$ represents the\nforward modeling operation of the wave equation as a function of the model $\\bf\nm$. Also, a fully-connected neural network (NN) is used to compute the model\n${\\bf H_w}{\\bf d}^{obs} \\approx \\bf m$ from the observed input data ${\\bf\nd}^{obs}$. 
The selection of weights $\\lambda_i$ and the NN operations determine\none of four different PIML algorithms.\n PIML offers potential advantages over standard FWI through its enhanced\nability to avoid local minima and the option to locally train the inversion\noperator, minimizing the requirement for extensive training data for global\napplicability. However, the effectiveness of PIML relies on the similarity\nbetween the test and trained data. Nevertheless, a possible strategy to\novercome this limitation involves initial pretraining of a PIML architecture\nwith data from a broader region, followed by fine-tuning for specific data-a\nmethod reminiscent of the way large language models are pretrained and adapted\nfor various tasks.\n","authors":["Gerard T. Schuster","Shihang Feng"],"pdf_url":"https://arxiv.org/pdf/2310.08109v1.pdf","comment":"37 pages, 16 figures"},{"id":"http://arxiv.org/abs/2209.10404v3","updated":"2023-10-12T07:55:10Z","published":"2022-09-21T14:51:42Z","title":"GP-net: Flexible Viewpoint Grasp Proposal","summary":" We present the Grasp Proposal Network (GP-net), a Convolutional Neural\nNetwork model which can generate 6-DoF grasps from flexible viewpoints, e.g. as\nexperienced by mobile manipulators. To train GP-net, we synthetically generate\na dataset containing depth-images and ground-truth grasp information. In\nreal-world experiments, we use the EGAD evaluation benchmark to evaluate GP-net\nagainst two commonly used algorithms, the Volumetric Grasping Network (VGN) and\nthe Grasp Pose Detection package (GPD), on a PAL TIAGo mobile manipulator. In\ncontrast to the state-of-the-art methods in robotic grasping, GP-net can be\nused for grasping objects from flexible, unknown viewpoints without the need to\ndefine the workspace and achieves a grasp success of 54.4% compared to 51.6%\nfor VGN and 44.2% for GPD. We provide a ROS package along with our code and\npre-trained models at https://aucoroboticsmu.github.io/GP-net/.\n","authors":["Anna Konrad","John McDonald","Rudi Villing"],"pdf_url":"https://arxiv.org/pdf/2209.10404v3.pdf","comment":"Accepted to ICAR 2023"},{"id":"http://arxiv.org/abs/2309.15505v2","updated":"2023-10-12T07:55:05Z","published":"2023-09-27T09:13:40Z","title":"Finite Scalar Quantization: VQ-VAE Made Simple","summary":" We propose to replace vector quantization (VQ) in the latent representation\nof VQ-VAEs with a simple scheme termed finite scalar quantization (FSQ), where\nwe project the VAE representation down to a few dimensions (typically less than\n10). Each dimension is quantized to a small set of fixed values, leading to an\n(implicit) codebook given by the product of these sets. By appropriately\nchoosing the number of dimensions and values each dimension can take, we obtain\nthe same codebook size as in VQ. On top of such discrete representations, we\ncan train the same models that have been trained on VQ-VAE representations. For\nexample, autoregressive and masked transformer models for image generation,\nmultimodal generation, and dense prediction computer vision tasks. Concretely,\nwe employ FSQ with MaskGIT for image generation, and with UViM for depth\nestimation, colorization, and panoptic segmentation. Despite the much simpler\ndesign of FSQ, we obtain competitive performance in all these tasks. We\nemphasize that FSQ does not suffer from codebook collapse and does not need the\ncomplex machinery employed in VQ (commitment losses, codebook reseeding, code\nsplitting, entropy penalties, etc.) 
to learn expressive discrete\nrepresentations.\n","authors":["Fabian Mentzer","David Minnen","Eirikur Agustsson","Michael Tschannen"],"pdf_url":"https://arxiv.org/pdf/2309.15505v2.pdf","comment":"Code:\n https://github.com/google-research/google-research/tree/master/fsq"},{"id":"http://arxiv.org/abs/2310.08100v1","updated":"2023-10-12T07:50:37Z","published":"2023-10-12T07:50:37Z","title":"Generative Intrinsic Optimization: Intrisic Control with Model Learning","summary":" Future sequence represents the outcome after executing the action into the\nenvironment. When driven by the information-theoretic concept of mutual\ninformation, it seeks maximally informative consequences. Explicit outcomes may\nvary across state, return, or trajectory serving different purposes such as\ncredit assignment or imitation learning. However, the inherent nature of\nincorporating intrinsic motivation with reward maximization is often neglected.\nIn this work, we propose a variational approach to jointly learn the necessary\nquantity for estimating the mutual information and the dynamics model,\nproviding a general framework for incorporating different forms of outcomes of\ninterest. Integrated into a policy iteration scheme, our approach guarantees\nconvergence to the optimal policy. While we mainly focus on theoretical\nanalysis, our approach opens the possibilities of leveraging intrinsic control\nwith model learning to enhance sample efficiency and incorporate uncertainty of\nthe environment into decision-making.\n","authors":["Jianfei Ma"],"pdf_url":"https://arxiv.org/pdf/2310.08100v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08096v1","updated":"2023-10-12T07:43:27Z","published":"2023-10-12T07:43:27Z","title":"ClimateBERT-NetZero: Detecting and Assessing Net Zero and Reduction\n Targets","summary":" Public and private actors struggle to assess the vast amounts of information\nabout sustainability commitments made by various institutions. To address this\nproblem, we create a novel tool for automatically detecting corporate,\nnational, and regional net zero and reduction targets in three steps. First, we\nintroduce an expert-annotated data set with 3.5K text samples. Second, we train\nand release ClimateBERT-NetZero, a natural language classifier to detect\nwhether a text contains a net zero or reduction target. Third, we showcase its\nanalysis potential with two use cases: We first demonstrate how\nClimateBERT-NetZero can be combined with conventional question-answering (Q&A)\nmodels to analyze the ambitions displayed in net zero and reduction targets.\nFurthermore, we employ the ClimateBERT-NetZero model on quarterly earning call\ntranscripts and outline how communication patterns evolve over time. Our\nexperiments demonstrate promising pathways for extracting and analyzing net\nzero and emission reduction targets at scale.\n","authors":["Tobias Schimanski","Julia Bingler","Camilla Hyslop","Mathias Kraus","Markus Leippold"],"pdf_url":"https://arxiv.org/pdf/2310.08096v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08091v1","updated":"2023-10-12T07:38:10Z","published":"2023-10-12T07:38:10Z","title":"Discerning Temporal Difference Learning","summary":" Temporal difference learning (TD) is a foundational concept in reinforcement\nlearning (RL), aimed at efficiently assessing a policy's value function.\nTD($\\lambda$), a potent variant, incorporates a memory trace to distribute the\nprediction error into the historical context. 
However, this approach often\nneglects the significance of historical states and the relative importance of\npropagating the TD error, influenced by challenges such as visitation imbalance\nor outcome noise. To address this, we propose a novel TD algorithm named\ndiscerning TD learning (DTD), which allows flexible emphasis\nfunctions$-$predetermined or adapted during training$-$to allocate efforts\neffectively across states. We establish the convergence properties of our\nmethod within a specific class of emphasis functions and showcase its promising\npotential for adaptation to deep RL contexts. Empirical results underscore that\nemploying a judicious emphasis function not only improves value estimation but\nalso expedites learning across diverse scenarios.\n","authors":["Jianfei Ma"],"pdf_url":"https://arxiv.org/pdf/2310.08091v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08088v1","updated":"2023-10-12T07:26:41Z","published":"2023-10-12T07:26:41Z","title":"Dealing with zero-inflated data: achieving SOTA with a two-fold machine\n learning approach","summary":" In many cases, a machine learning model must learn to correctly predict a few\ndata points with particular values of interest in a broader range of data where\nmany target values are zero. Zero-inflated data can be found in diverse\nscenarios, such as lumpy and intermittent demands, power consumption for home\nappliances being turned on and off, impurities measurement in distillation\nprocesses, and even airport shuttle demand prediction. The presence of zeroes\naffects the models' learning and may result in poor performance. Furthermore,\nzeroes also distort the metrics used to compute the model's prediction quality.\nThis paper showcases two real-world use cases (home appliances classification\nand airport shuttle demand prediction) where a hierarchical model applied in\nthe context of zero-inflated data leads to excellent results. In particular,\nfor home appliances classification, the weighted average of Precision, Recall,\nF1, and AUC ROC was increased by 27%, 34%, 49%, and 27%, respectively.\nFurthermore, it is estimated that the proposed approach is also four times more\nenergy efficient than the SOTA approach against which it was compared to.\nTwo-fold models performed best in all cases when predicting airport shuttle\ndemand, and the difference against other models has been proven to be\nstatistically significant.\n","authors":["Jože M. Rožanec","Gašper Petelin","João Costa","Blaž Bertalanič","Gregor Cerar","Marko Guček","Gregor Papa","Dunja Mladenić"],"pdf_url":"https://arxiv.org/pdf/2310.08088v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08087v1","updated":"2023-10-12T07:20:03Z","published":"2023-10-12T07:20:03Z","title":"A Carbon Tracking Model for Federated Learning: Impact of Quantization\n and Sparsification","summary":" Federated Learning (FL) methods adopt efficient communication technologies to\ndistribute machine learning tasks across edge devices, reducing the overhead in\nterms of data storage and computational complexity compared to centralized\nsolutions. Rather than moving large data volumes from producers (sensors,\nmachines) to energy-hungry data centers, raising environmental concerns due to\nresource demands, FL provides an alternative solution to mitigate the energy\ndemands of several learning tasks while enabling new Artificial Intelligence of\nThings (AIoT) applications. 
This paper proposes a framework for real-time\nmonitoring of the energy and carbon footprint impacts of FL systems. The carbon\ntracking tool is evaluated for consensus (fully decentralized) and classical FL\npolicies. For the first time, we present a quantitative evaluation of different\ncomputationally and communication efficient FL methods from the perspectives of\nenergy consumption and carbon equivalent emissions, suggesting also general\nguidelines for energy-efficient design. Results indicate that consensus-driven\nFL implementations should be preferred for limiting carbon emissions when the\nenergy efficiency of the communication is low (i.e., < 25 Kbit/Joule). Besides,\nquantization and sparsification operations are shown to strike a balance\nbetween learning performances and energy consumption, leading to sustainable FL\ndesigns.\n","authors":["Luca Barbieri","Stefano Savazzi","Sanaz Kianoush","Monica Nicoli","Luigi Serio"],"pdf_url":"https://arxiv.org/pdf/2310.08087v1.pdf","comment":"accepted for presentation at IEEE CAMAD 2023"},{"id":"http://arxiv.org/abs/2307.03486v2","updated":"2023-10-12T07:13:32Z","published":"2023-07-07T09:47:15Z","title":"Discovering Hierarchical Achievements in Reinforcement Learning via\n Contrastive Learning","summary":" Discovering achievements with a hierarchical structure in procedurally\ngenerated environments presents a significant challenge. This requires an agent\nto possess a broad range of abilities, including generalization and long-term\nreasoning. Many prior methods have been built upon model-based or hierarchical\napproaches, with the belief that an explicit module for long-term planning\nwould be advantageous for learning hierarchical dependencies. However, these\nmethods demand an excessive number of environment interactions or large model\nsizes, limiting their practicality. In this work, we demonstrate that proximal\npolicy optimization (PPO), a simple yet versatile model-free algorithm,\noutperforms previous methods when optimized with recent implementation\npractices. Moreover, we find that the PPO agent can predict the next\nachievement to be unlocked to some extent, albeit with limited confidence.\nBased on this observation, we introduce a novel contrastive learning method,\ncalled achievement distillation, which strengthens the agent's ability to\npredict the next achievement. Our method exhibits a strong capacity for\ndiscovering hierarchical achievements and shows state-of-the-art performance on\nthe challenging Crafter environment in a sample-efficient manner while\nutilizing fewer model parameters.\n","authors":["Seungyong Moon","Junyoung Yeom","Bumsoo Park","Hyun Oh Song"],"pdf_url":"https://arxiv.org/pdf/2307.03486v2.pdf","comment":"Accepted at NeurIPS 2023"},{"id":"http://arxiv.org/abs/2310.08078v1","updated":"2023-10-12T06:59:10Z","published":"2023-10-12T06:59:10Z","title":"To token or not to token: A Comparative Study of Text Representations\n for Cross-Lingual Transfer","summary":" Choosing an appropriate tokenization scheme is often a bottleneck in\nlow-resource cross-lingual transfer. To understand the downstream implications\nof text representation choices, we perform a comparative analysis on language\nmodels having diverse text representation modalities including 2\nsegmentation-based models (\\texttt{BERT}, \\texttt{mBERT}), 1 image-based model\n(\\texttt{PIXEL}), and 1 character-level model (\\texttt{CANINE}). 
First, we\npropose a scoring Language Quotient (LQ) metric capable of providing a weighted\nrepresentation of both zero-shot and few-shot evaluation combined. Utilizing\nthis metric, we perform experiments comprising 19 source languages and 133\ntarget languages on three tasks (POS tagging, Dependency parsing, and NER). Our\nanalysis reveals that image-based models excel in cross-lingual transfer when\nlanguages are closely related and share visually similar scripts. However, for\ntasks biased toward word meaning (POS, NER), segmentation-based models prove to\nbe superior. Furthermore, in dependency parsing tasks where word relationships\nplay a crucial role, models with their character-level focus, outperform\nothers. Finally, we propose a recommendation scheme based on our findings to\nguide model selection according to task and language requirements.\n","authors":["Md Mushfiqur Rahman","Fardin Ahsan Sakib","Fahim Faisal","Antonios Anastasopoulos"],"pdf_url":"https://arxiv.org/pdf/2310.08078v1.pdf","comment":"Accepted at 3RD MULTILINGUAL REPRESENTATION LEARNING (MRL) WORKSHOP,\n 2023"},{"id":"http://arxiv.org/abs/2310.07312v2","updated":"2023-10-12T06:57:51Z","published":"2023-10-11T08:57:59Z","title":"WiGenAI: The Symphony of Wireless and Generative AI via Diffusion Models","summary":" Innovative foundation models, such as GPT-3 and stable diffusion models, have\nmade a paradigm shift in the realm of artificial intelligence (AI) towards\ngenerative AI-based systems. In unison, from data communication and networking\nperspective, AI and machine learning (AI/ML) algorithms are envisioned to be\npervasively incorporated into the future generations of wireless communications\nsystems, highlighting the need for novel AI-native solutions for the emergent\ncommunication scenarios. In this article, we outline the applications of\ngenerative AI in wireless communication systems to lay the foundations for\nresearch in this field. Diffusion-based generative models, as the new\nstate-of-the-art paradigm of generative models, are introduced, and their\napplications in wireless communication systems are discussed. Two case studies\nare also presented to showcase how diffusion models can be exploited for the\ndevelopment of resilient AI-native communication systems. Specifically, we\npropose denoising diffusion probabilistic models (DDPM) for a wireless\ncommunication scheme with non-ideal transceivers, where 30% improvement is\nachieved in terms of bit error rate. As the second application, DDPMs are\nemployed at the transmitter to shape the constellation symbols, highlighting a\nrobust out-of-distribution performance. Finally, future directions and open\nissues for the development of generative AI-based wireless systems are\ndiscussed to promote future research endeavors towards wireless generative AI\n(WiGenAI).\n","authors":["Mehdi Letafati","Samad Ali","Matti Latva-aho"],"pdf_url":"https://arxiv.org/pdf/2310.07312v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08073v1","updated":"2023-10-12T06:50:43Z","published":"2023-10-12T06:50:43Z","title":"Samples on Thin Ice: Re-Evaluating Adversarial Pruning of Neural\n Networks","summary":" Neural network pruning has shown to be an effective technique for reducing\nthe network size, trading desirable properties like generalization and\nrobustness to adversarial attacks for higher sparsity. Recent work has claimed\nthat adversarial pruning methods can produce sparse networks while also\npreserving robustness to adversarial examples. 
In this work, we first\nre-evaluate three state-of-the-art adversarial pruning methods, showing that\ntheir robustness was indeed overestimated. We then compare pruned and dense\nversions of the same models, discovering that samples on thin ice, i.e., closer\nto the unpruned model's decision boundary, are typically misclassified after\npruning. We conclude by discussing how this intuition may lead to designing\nmore effective adversarial pruning methods in future work.\n","authors":["Giorgio Piras","Maura Pintor","Ambra Demontis","Battista Biggio"],"pdf_url":"https://arxiv.org/pdf/2310.08073v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.02010v2","updated":"2023-10-12T06:46:25Z","published":"2023-06-03T05:45:29Z","title":"Memorization Capacity of Multi-Head Attention in Transformers","summary":" Transformers have become the go-to architecture for language and vision\ntasks, yet their theoretical properties, especially memorization capacity,\nremain elusive. This paper investigates the memorization abilities of\nmulti-head attention mechanisms, examining how many example sequences they can\nmemorize, as a function of the number of heads and sequence length. Motivated\nby experimental findings on vision transformers, we introduce novel assumptions\nabout the linear independence of input data, distinct from the commonly used\ngeneral-position assumption. Under these assumptions, we demonstrate that an\nattention layer with $H$ heads, dimension $d$, and context size $n < d$,\nfeaturing $\\Theta(Hd^2)$ parameters, can memorize $\\Omega(Hn)$ examples. Our\nanalysis sheds light on how different attention heads handle various example\nsequences, aided by the softmax operator's saturation property. We validate our\nfindings through experiments on synthetic data.\n","authors":["Sadegh Mahdavi","Renjie Liao","Christos Thrampoulidis"],"pdf_url":"https://arxiv.org/pdf/2306.02010v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08071v1","updated":"2023-10-12T06:36:41Z","published":"2023-10-12T06:36:41Z","title":"Learning Transferable Conceptual Prototypes for Interpretable\n Unsupervised Domain Adaptation","summary":" Despite the great progress of unsupervised domain adaptation (UDA) with the\ndeep neural networks, current UDA models are opaque and cannot provide\npromising explanations, limiting their applications in the scenarios that\nrequire safe and controllable model decisions. At present, a surge of work\nfocuses on designing deep interpretable methods with adequate data annotations\nand only a few methods consider the distributional shift problem. Most existing\ninterpretable UDA methods are post-hoc ones, which cannot facilitate the model\nlearning process for performance enhancement. In this paper, we propose an\ninherently interpretable method, named Transferable Conceptual Prototype\nLearning (TCPL), which could simultaneously interpret and improve the processes\nof knowledge transfer and decision-making in UDA. To achieve this goal, we\ndesign a hierarchically prototypical module that transfers categorical basic\nconcepts from the source domain to the target domain and learns domain-shared\nprototypes for explaining the underlying reasoning process. With the learned\ntransferable prototypes, a self-predictive consistent pseudo-label strategy\nthat fuses confidence, predictions, and prototype information, is designed for\nselecting suitable target samples for pseudo annotations and gradually\nnarrowing down the domain gap. 
Comprehensive experiments show that the proposed\nmethod can not only provide effective and intuitive explanations but also\noutperform previous state-of-the-arts.\n","authors":["Junyu Gao","Xinhong Ma","Changsheng Xu"],"pdf_url":"https://arxiv.org/pdf/2310.08071v1.pdf","comment":"Submitted to IEEE TIP"},{"id":"http://arxiv.org/abs/2310.08070v1","updated":"2023-10-12T06:36:31Z","published":"2023-10-12T06:36:31Z","title":"Tight Time-Space Lower Bounds for Constant-Pass Learning","summary":" In his breakthrough paper, Raz showed that any parity learning algorithm\nrequires either quadratic memory or an exponential number of samples [FOCS'16,\nJACM'19]. A line of work that followed extended this result to a large class of\nlearning problems. Until recently, all these results considered learning in the\nstreaming model, where each sample is drawn independently, and the learner is\nallowed a single pass over the stream of samples. Garg, Raz, and Tal [CCC'19]\nconsidered a stronger model, allowing multiple passes over the stream. In the\n$2$-pass model, they showed that learning parities of size $n$ requires either\na memory of size $n^{1.5}$ or at least $2^{\\sqrt{n}}$ samples. (Their result\nalso generalizes to other learning problems.)\n In this work, for any constant $q$, we prove tight memory-sample lower bounds\nfor any parity learning algorithm that makes $q$ passes over the stream of\nsamples. We show that such a learner requires either $\\Omega(n^{2})$ memory\nsize or at least $2^{\\Omega(n)}$ samples. Beyond establishing a tight lower\nbound, this is the first non-trivial lower bound for $q$-pass learning for any\n$q\\ge 3$. Similar to prior work, our results extend to any learning problem\nwith many nearly-orthogonal concepts.\n We complement the lower bound with an upper bound, showing that parity\nlearning with $q$ passes can be done efficiently with $O(n^2/\\log q)$ memory.\n","authors":["Xin Lyu","Avishay Tal","Hongxun Wu","Junzhao Yang"],"pdf_url":"https://arxiv.org/pdf/2310.08070v1.pdf","comment":"To appear at FOCS 2023"},{"id":"http://arxiv.org/abs/2310.08069v1","updated":"2023-10-12T06:32:42Z","published":"2023-10-12T06:32:42Z","title":"Rethinking Negative Pairs in Code Search","summary":" Recently, contrastive learning has become a key component in fine-tuning code\nsearch models for software development efficiency and effectiveness. It pulls\ntogether positive code snippets while pushing negative samples away given\nsearch queries. Among contrastive learning, InfoNCE is the most widely used\nloss function due to its better performance. However, the following problems in\nnegative samples of InfoNCE may deteriorate its representation learning: 1) The\nexistence of false negative samples in large code corpora due to duplications.\n2). The failure to explicitly differentiate between the potential relevance of\nnegative samples. As an example, a bubble sorting algorithm example is less\n``negative'' than a file saving function for the quick sorting algorithm query.\nIn this paper, we tackle the above problems by proposing a simple yet effective\nSoft-InfoNCE loss that inserts weight terms into InfoNCE. In our proposed loss\nfunction, we apply three methods to estimate the weights of negative pairs and\nshow that the vanilla InfoNCE loss is a special case of Soft-InfoNCE.\nTheoretically, we analyze the effects of Soft-InfoNCE on controlling the\ndistribution of learnt code representations and on deducing a more precise\nmutual information estimation. 
We furthermore discuss the superiority of\nproposed loss functions with other design alternatives. Extensive experiments\ndemonstrate the effectiveness of Soft-InfoNCE and weights estimation methods\nunder state-of-the-art code search models on a large-scale public dataset\nconsisting of six programming languages. Source code is available at\n\\url{https://github.com/Alex-HaochenLi/Soft-InfoNCE}.\n","authors":["Haochen Li","Xin Zhou","Luu Anh Tuan","Chunyan Miao"],"pdf_url":"https://arxiv.org/pdf/2310.08069v1.pdf","comment":"Accepted to EMNLP 2023"},{"id":"http://arxiv.org/abs/2309.03084v2","updated":"2023-10-12T06:24:33Z","published":"2023-09-04T09:16:49Z","title":"Pure Monte Carlo Counterfactual Regret Minimization","summary":" Counterfactual Regret Minimization (CFR) and its variants are the best\nalgorithms so far for solving large-scale incomplete information games.\nHowever, we believe that there are two problems with CFR: First, matrix\nmultiplication is required in CFR iteration, and the time complexity of one\niteration is too high; Secondly, the game characteristics in the real world are\ndifferent. Just using one CFR algorithm will not be perfectly suitable for all\ngame problems.\n For these two problems, this paper proposes a new algorithm called Pure CFR\n(PCFR) based on CFR. PCFR can be seen as a combination of CFR and Fictitious\nPlay (FP), inheriting the concept of counterfactual regret (value) from CFR,\nand using the best response strategy instead of the regret matching strategy\nfor the next iteration. This algorithm has three advantages. First, PCFR can be\ncombined with any CFR variant. The resulting Pure MCCFR (PMCCFR) can\nsignificantly reduce the time and space complexity of one iteration. Secondly,\nour experiments show that the convergence speed of the PMCCFR is 2$\\sim$3 times\nthat of the MCCFR. Finally, there is a type of game that is very suitable for\nPCFR, we call this type of game clear-game, which is characterized by a high\nproportion of dominated strategies. Experiments show that in clear-game, the\nconvergence rate of PMCCFR is two orders of magnitude higher than that of\nMCCFR.\n","authors":["Ju Qi","Ting Feng","Falun Hei","Zhemei Fang","Yunfeng Luo"],"pdf_url":"https://arxiv.org/pdf/2309.03084v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08061v1","updated":"2023-10-12T06:23:12Z","published":"2023-10-12T06:23:12Z","title":"ETDock: A Novel Equivariant Transformer for Protein-Ligand Docking","summary":" Predicting the docking between proteins and ligands is a crucial and\nchallenging task for drug discovery. However, traditional docking methods\nmainly rely on scoring functions, and deep learning-based docking approaches\nusually neglect the 3D spatial information of proteins and ligands, as well as\nthe graph-level features of ligands, which limits their performance. To address\nthese limitations, we propose an equivariant transformer neural network for\nprotein-ligand docking pose prediction. Our approach involves the fusion of\nligand graph-level features by feature processing, followed by the learning of\nligand and protein representations using our proposed TAMformer module.\nAdditionally, we employ an iterative optimization approach based on the\npredicted distance matrix to generate refined ligand poses. 
The experimental\nresults on real datasets show that our model can achieve state-of-the-art\nperformance.\n","authors":["Yiqiang Yi","Xu Wan","Yatao Bian","Le Ou-Yang","Peilin Zhao"],"pdf_url":"https://arxiv.org/pdf/2310.08061v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08056v1","updated":"2023-10-12T06:09:26Z","published":"2023-10-12T06:09:26Z","title":"Learning from Label Proportions: Bootstrapping Supervised Learners via\n Belief Propagation","summary":" Learning from Label Proportions (LLP) is a learning problem where only\naggregate level labels are available for groups of instances, called bags,\nduring training, and the aim is to get the best performance at the\ninstance-level on the test data. This setting arises in domains like\nadvertising and medicine due to privacy considerations. We propose a novel\nalgorithmic framework for this problem that iteratively performs two main\nsteps. For the first step (Pseudo Labeling) in every iteration, we define a\nGibbs distribution over binary instance labels that incorporates a) covariate\ninformation through the constraint that instances with similar covariates\nshould have similar labels and b) the bag level aggregated label. We then use\nBelief Propagation (BP) to marginalize the Gibbs distribution to obtain pseudo\nlabels. In the second step (Embedding Refinement), we use the pseudo labels to\nprovide supervision for a learner that yields a better embedding. Further, we\niterate on the two steps again by using the second step's embeddings as new\ncovariates for the next iteration. In the final iteration, a classifier is\ntrained using the pseudo labels. Our algorithm displays strong gains against\nseveral SOTA baselines (up to 15%) for the LLP Binary Classification problem on\nvarious dataset types - tabular and Image. We achieve these improvements with\nminimal computational overhead above standard supervised learning due to Belief\nPropagation, for large bag sizes, even for a million samples.\n","authors":["Shreyas Havaldar","Navodita Sharma","Shubhi Sareen","Karthikeyan Shanmugam","Aravindan Raghuveer"],"pdf_url":"https://arxiv.org/pdf/2310.08056v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07365v2","updated":"2023-10-12T06:00:52Z","published":"2023-10-11T10:30:49Z","title":"GraphControl: Adding Conditional Control to Universal Graph Pre-trained\n Models for Graph Domain Transfer Learning","summary":" Graph-structured data is ubiquitous in the world which models complex\nrelationships between objects, enabling various Web applications. Daily\ninfluxes of unlabeled graph data on the Web offer immense potential for these\napplications. Graph self-supervised algorithms have achieved significant\nsuccess in acquiring generic knowledge from abundant unlabeled graph data.\nThese pre-trained models can be applied to various downstream Web applications,\nsaving training time and improving downstream (target) performance. However,\ndifferent graphs, even across seemingly similar domains, can differ\nsignificantly in terms of attribute semantics, posing difficulties, if not\ninfeasibility, for transferring the pre-trained models to downstream tasks.\nConcretely speaking, for example, the additional task-specific node information\nin downstream tasks (specificity) is usually deliberately omitted so that the\npre-trained representation (transferability) can be leveraged. The trade-off as\nsuch is termed as \"transferability-specificity dilemma\" in this work. 
To\naddress this challenge, we introduce an innovative deployment module coined as\nGraphControl, motivated by ControlNet, to realize better graph domain transfer\nlearning. Specifically, by leveraging universal structural pre-trained models\nand GraphControl, we align the input space across various graphs and\nincorporate unique characteristics of target data as conditional inputs. These\nconditions will be progressively integrated into the model during fine-tuning\nor prompt tuning through ControlNet, facilitating personalized deployment.\nExtensive experiments show that our method significantly enhances the\nadaptability of pre-trained models on target attributed datasets, achieving\n1.4-3x performance gain. Furthermore, it outperforms training-from-scratch\nmethods on target data with a comparable margin and exhibits faster\nconvergence.\n","authors":["Yun Zhu","Yaoke Wang","Haizhou Shi","Zhenshuo Zhang","Siliang Tang"],"pdf_url":"https://arxiv.org/pdf/2310.07365v2.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2310.08051v1","updated":"2023-10-12T05:52:54Z","published":"2023-10-12T05:52:54Z","title":"LGL-BCI: A Lightweight Geometric Learning Framework for Motor\n Imagery-Based Brain-Computer Interfaces","summary":" Brain-Computer Interfaces (BCIs) are a groundbreaking technology for\ninteracting with external devices using brain signals. Despite advancements,\nelectroencephalogram (EEG)-based Motor Imagery (MI) tasks face challenges like\namplitude and phase variability, and complex spatial correlations, with a need\nfor smaller model size and faster inference. This study introduces the LGL-BCI\nframework, employing a Geometric Deep Learning Framework for EEG processing in\nnon-Euclidean metric spaces, particularly the Symmetric Positive Definite (SPD)\nManifold space. LGL-BCI offers robust EEG data representation and captures\nspatial correlations. We propose an EEG channel selection solution via a\nfeature decomposition algorithm to reduce SPD matrix dimensionality, with a\nlossless transformation boosting inference speed. Extensive experiments show\nLGL-BCI's superior accuracy and efficiency compared to current solutions,\nhighlighting geometric deep learning's potential in MI-BCI applications. The\nefficiency, assessed on two public EEG datasets and two real-world EEG devices,\nsignificantly outperforms the state-of-the-art solution in accuracy ($82.54\\%$\nversus $62.22\\%$) with fewer parameters (64.9M compared to 183.7M).\n","authors":["Jianchao Lu","Yuzhe Tian","Yang Zhang","Jiaqi Ge","Quan Z. Sheng","Xi Zheng"],"pdf_url":"https://arxiv.org/pdf/2310.08051v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.06599v4","updated":"2023-10-12T05:51:30Z","published":"2023-06-11T06:27:06Z","title":"Variational Imbalanced Regression: Fair Uncertainty Quantification via\n Probabilistic Smoothing","summary":" Existing regression models tend to fall short in both accuracy and\nuncertainty estimation when the label distribution is imbalanced. In this\npaper, we propose a probabilistic deep learning model, dubbed variational\nimbalanced regression (VIR), which not only performs well in imbalanced\nregression but naturally produces reasonable uncertainty estimation as a\nbyproduct. 
Different from typical variational autoencoders assuming I.I.D.\nrepresentations (a data point's representation is not directly affected by\nother data points), our VIR borrows data with similar regression labels to\ncompute the latent representation's variational distribution; furthermore,\ndifferent from deterministic regression models producing point estimates, VIR\npredicts the entire normal-inverse-gamma distributions and modulates the\nassociated conjugate distributions to impose probabilistic reweighting on the\nimbalanced data, thereby providing better uncertainty estimation. Experiments\nin several real-world datasets show that our VIR can outperform\nstate-of-the-art imbalanced regression models in terms of both accuracy and\nuncertainty estimation. Code will soon be available at\n\\url{https://github.com/Wang-ML-Lab/variational-imbalanced-regression}.\n","authors":["Ziyan Wang","Hao Wang"],"pdf_url":"https://arxiv.org/pdf/2306.06599v4.pdf","comment":"Accepted at NeurIPS 2023"},{"id":"http://arxiv.org/abs/2310.08049v1","updated":"2023-10-12T05:43:06Z","published":"2023-10-12T05:43:06Z","title":"Exploring the Relationship Between Model Architecture and In-Context\n Learning Ability","summary":" What is the relationship between model architecture and the ability to\nperform in-context learning? In this empirical study, we take the first steps\ntowards answering this question. In particular, we evaluate fifteen model\narchitectures across a suite of synthetic in-context learning tasks. The\nselected architectures represent a broad range of paradigms, including\nrecurrent and convolution-based neural networks, transformers, and emerging\nattention alternatives. We discover that all considered architectures can\nperform in-context learning under certain conditions. However, contemporary\narchitectures are found to be the best performing, especially as task\ncomplexity grows. Additionally, our follow-up experiments delve into various\nfactors that influence in-context learning. We observe varied sensitivities\namong architectures with respect to hyperparameter settings. Our study of\ntraining dynamics reveals that certain architectures exhibit a smooth,\nprogressive learning trajectory, while others demonstrate periods of stagnation\nfollowed by abrupt mastery of the task. Finally, and somewhat surprisingly, we\nfind that several emerging attention alternatives are more robust in-context\nlearners than transformers; since such approaches have constant-sized memory\nfootprints at inference time, this result opens the future possibility of\nscaling up in-context learning to vastly larger numbers of in-context examples.\n","authors":["Ivan Lee","Nan Jiang","Taylor Berg-Kirkpatrick"],"pdf_url":"https://arxiv.org/pdf/2310.08049v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.05624v2","updated":"2023-10-12T05:33:19Z","published":"2023-10-09T11:26:58Z","title":"Locality-Aware Generalizable Implicit Neural Representation","summary":" Generalizable implicit neural representation (INR) enables a single\ncontinuous function, i.e., a coordinate-based neural network, to represent\nmultiple data instances by modulating its weights or intermediate features\nusing latent codes. However, the expressive power of the state-of-the-art\nmodulation is limited due to its inability to localize and capture fine-grained\ndetails of data entities such as specific pixels and rays. 
To address this\nissue, we propose a novel framework for generalizable INR that combines a\ntransformer encoder with a locality-aware INR decoder. The transformer encoder\npredicts a set of latent tokens from a data instance to encode local\ninformation into each latent token. The locality-aware INR decoder extracts a\nmodulation vector by selectively aggregating the latent tokens via\ncross-attention for a coordinate input and then predicts the output by\nprogressively decoding with coarse-to-fine modulation through multiple\nfrequency bandwidths. The selective token aggregation and the multi-band\nfeature modulation enable us to learn locality-aware representation in spatial\nand spectral aspects, respectively. Our framework significantly outperforms\nprevious generalizable INRs and validates the usefulness of the locality-aware\nlatents for downstream tasks such as image generation.\n","authors":["Doyup Lee","Chiheon Kim","Minsu Cho","Wook-Shin Han"],"pdf_url":"https://arxiv.org/pdf/2310.05624v2.pdf","comment":"19 pages, 12 figures"},{"id":"http://arxiv.org/abs/2310.08041v1","updated":"2023-10-12T05:25:49Z","published":"2023-10-12T05:25:49Z","title":"QLLM: Accurate and Efficient Low-Bitwidth Quantization for Large\n Language Models","summary":" Large Language Models (LLMs) excel in NLP, but their demands hinder their\nwidespread deployment. While Quantization-Aware Training (QAT) offers a\nsolution, its extensive training costs make Post-Training Quantization (PTQ) a\nmore practical approach for LLMs. In existing studies, activation outliers in\nparticular channels are identified as the bottleneck to PTQ accuracy. They\npropose to transform the magnitudes from activations to weights, which however\noffers limited alleviation or suffers from unstable gradients, resulting in a\nsevere performance drop at low-bitwidth. In this paper, we propose QLLM, an\naccurate and efficient low-bitwidth PTQ method designed for LLMs. QLLM\nintroduces an adaptive channel reassembly technique that reallocates the\nmagnitude of outliers to other channels, thereby mitigating their impact on the\nquantization range. This is achieved by channel disassembly and channel\nassembly, which first breaks down the outlier channels into several\nsub-channels to ensure a more balanced distribution of activation magnitudes.\nThen similar channels are merged to maintain the original channel number for\nefficiency. Additionally, an adaptive strategy is designed to autonomously\ndetermine the optimal number of sub-channels for channel disassembly. To\nfurther compensate for the performance loss caused by quantization, we propose\nan efficient tuning method that only learns a small number of low-rank weights\nwhile freezing the pre-trained quantized model. After training, these low-rank\nparameters can be fused into the frozen weights without affecting inference.\nExtensive experiments on LLaMA-1 and LLaMA-2 show that QLLM can obtain accurate\nquantized models efficiently. 
For example, QLLM quantizes the 4-bit LLaMA-2-70B\nwithin 10 hours on a single A100-80G GPU, outperforming the previous\nstate-of-the-art method by 7.89% on the average accuracy across five zero-shot\ntasks.\n","authors":["Jing Liu","Ruihao Gong","Xiuying Wei","Zhiwei Dong","Jianfei Cai","Bohan Zhuang"],"pdf_url":"https://arxiv.org/pdf/2310.08041v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.10902v2","updated":"2023-10-12T05:21:21Z","published":"2023-01-26T02:22:46Z","title":"Efficient Hyperdimensional Computing","summary":" Hyperdimensional computing (HDC) is a method to perform classification that\nuses binary vectors with high dimensions and the majority rule. This approach\nhas the potential to be energy-efficient and hence deemed suitable for\nresource-limited platforms due to its simplicity and massive parallelism.\nHowever, in order to achieve high accuracy, HDC sometimes uses hypervectors\nwith tens of thousands of dimensions. This potentially negates its efficiency\nadvantage. In this paper, we examine the necessity of such high dimensions and\nconduct a detailed theoretical analysis of the relationship between hypervector\ndimensions and accuracy. Our results demonstrate that as the dimension of the\nhypervectors increases, the worst-case/average-case HDC prediction accuracy\nwith the majority rule decreases. Building on this insight, we develop HDC\nmodels that use binary hypervectors with dimensions orders of magnitude lower\nthan those of state-of-the-art HDC models while maintaining equivalent or even\nimproved accuracy and efficiency. For instance, on the MNIST dataset, we\nachieve 91.12% HDC accuracy in image classification with a dimension of only\n64. Our methods perform operations that are only 0.35% of other HDC models with\ndimensions of 10,000. Furthermore, we evaluate our methods on ISOLET, UCI-HAR,\nand Fashion-MNIST datasets and investigate the limits of HDC computing.\n","authors":["Zhanglu Yan","Shida Wang","Kaiwen Tang","Weng-Fai Wong"],"pdf_url":"https://arxiv.org/pdf/2301.10902v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08040v1","updated":"2023-10-12T05:20:18Z","published":"2023-10-12T05:20:18Z","title":"SEE-OoD: Supervised Exploration For Enhanced Out-of-Distribution\n Detection","summary":" Current techniques for Out-of-Distribution (OoD) detection predominantly rely\non quantifying predictive uncertainty and incorporating model regularization\nduring the training phase, using either real or synthetic OoD samples. However,\nmethods that utilize real OoD samples lack exploration and are prone to overfit\nthe OoD samples at hand. Whereas synthetic samples are often generated based on\nfeatures extracted from training data, rendering them less effective when the\ntraining and OoD data are highly overlapped in the feature space. In this work,\nwe propose a Wasserstein-score-based generative adversarial training scheme to\nenhance OoD detection accuracy, which, for the first time, performs data\naugmentation and exploration simultaneously under the supervision of limited\nOoD samples. Specifically, the generator explores OoD spaces and generates\nsynthetic OoD samples using feedback from the discriminator, while the\ndiscriminator exploits both the observed and synthesized samples for OoD\ndetection using a predefined Wasserstein score. We provide theoretical\nguarantees that the optimal solutions of our generative scheme are\nstatistically achievable through adversarial training in empirical settings. 
We\nthen demonstrate that the proposed method outperforms state-of-the-art\ntechniques on various computer vision datasets and exhibits superior\ngeneralizability to unseen OoD data.\n","authors":["Xiaoyang Song","Wenbo Sun","Maher Nouiehed","Raed Al Kontar","Judy Jin"],"pdf_url":"https://arxiv.org/pdf/2310.08040v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07297v2","updated":"2023-10-12T05:15:51Z","published":"2023-10-11T08:31:26Z","title":"Score Regularized Policy Optimization through Diffusion Behavior","summary":" Recent developments in offline reinforcement learning have uncovered the\nimmense potential of diffusion modeling, which excels at representing\nheterogeneous behavior policies. However, sampling from diffusion policies is\nconsiderably slow because it necessitates tens to hundreds of iterative\ninference steps for one action. To address this issue, we propose to extract an\nefficient deterministic inference policy from critic models and pretrained\ndiffusion behavior models, leveraging the latter to directly regularize the\npolicy gradient with the behavior distribution's score function during\noptimization. Our method enjoys powerful generative capabilities of diffusion\nmodeling while completely circumventing the computationally intensive and\ntime-consuming diffusion sampling scheme, both during training and evaluation.\nExtensive results on D4RL tasks show that our method boosts action sampling\nspeed by more than 25 times compared with various leading diffusion-based\nmethods in locomotion tasks, while still maintaining state-of-the-art\nperformance.\n","authors":["Huayu Chen","Cheng Lu","Zhengyi Wang","Hang Su","Jun Zhu"],"pdf_url":"https://arxiv.org/pdf/2310.07297v2.pdf","comment":"18 pages"},{"id":"http://arxiv.org/abs/2310.08039v1","updated":"2023-10-12T05:14:42Z","published":"2023-10-12T05:14:42Z","title":"Rethinking Large-scale Pre-ranking System: Entire-chain Cross-domain\n Models","summary":" Industrial systems such as recommender systems and online advertising, have\nbeen widely equipped with multi-stage architectures, which are divided into\nseveral cascaded modules, including matching, pre-ranking, ranking and\nre-ranking. As a critical bridge between matching and ranking, existing\npre-ranking approaches mainly endure sample selection bias (SSB) problem owing\nto ignoring the entire-chain data dependence, resulting in sub-optimal\nperformances. In this paper, we rethink pre-ranking system from the perspective\nof the entire sample space, and propose Entire-chain Cross-domain Models (ECM),\nwhich leverage samples from the whole cascaded stages to effectively alleviate\nSSB problem. Besides, we design a fine-grained neural structure named ECMM to\nfurther improve the pre-ranking accuracy. Specifically, we propose a\ncross-domain multi-tower neural network to comprehensively predict for each\nstage result, and introduce the sub-networking routing strategy with $L0$\nregularization to reduce computational costs. 
Evaluations on real-world\nlarge-scale traffic logs demonstrate that our pre-ranking models outperform\nSOTA methods while time consumption is maintained within an acceptable level,\nwhich achieves better trade-off between efficiency and effectiveness.\n","authors":["Jinbo Song","Ruoran Huang","Xinyang Wang","Wei Huang","Qian Yu","Mingming Chen","Yafei Yao","Chaosheng Fan","Changping Peng","Zhangang Lin","Jinghe Hu","Jingping Shao"],"pdf_url":"https://arxiv.org/pdf/2310.08039v1.pdf","comment":"5 pages, 2 figures"},{"id":"http://arxiv.org/abs/2310.08038v1","updated":"2023-10-12T05:09:27Z","published":"2023-10-12T05:09:27Z","title":"Continual Learning via Manifold Expansion Replay","summary":" In continual learning, the learner learns multiple tasks in sequence, with\ndata being acquired only once for each task. Catastrophic forgetting is a major\nchallenge to continual learning. To reduce forgetting, some existing\nrehearsal-based methods use episodic memory to replay samples of previous\ntasks. However, in the process of knowledge integration when learning a new\ntask, this strategy also suffers from catastrophic forgetting due to an\nimbalance between old and new knowledge. To address this problem, we propose a\nnovel replay strategy called Manifold Expansion Replay (MaER). We argue that\nexpanding the implicit manifold of the knowledge representation in the episodic\nmemory helps to improve the robustness and expressiveness of the model. To this\nend, we propose a greedy strategy to keep increasing the diameter of the\nimplicit manifold represented by the knowledge in the buffer during memory\nmanagement. In addition, we introduce Wasserstein distance instead of cross\nentropy as distillation loss to preserve previous knowledge. With extensive\nexperimental validation on MNIST, CIFAR10, CIFAR100, and TinyImageNet, we show\nthat the proposed method significantly improves the accuracy in continual\nlearning setup, outperforming the state of the arts.\n","authors":["Zihao Xu","Xuan Tang","Yufei Shi","Jianfeng Zhang","Jian Yang","Mingsong Chen","Xian Wei"],"pdf_url":"https://arxiv.org/pdf/2310.08038v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08036v1","updated":"2023-10-12T05:08:21Z","published":"2023-10-12T05:08:21Z","title":"ZEST: Attention-based Zero-Shot Learning for Unseen IoT Device\n Classification","summary":" Recent research works have proposed machine learning models for classifying\nIoT devices connected to a network. However, there is still a practical\nchallenge of not having all devices (and hence their traffic) available during\nthe training of a model. This essentially means, during the operational phase,\nwe need to classify new devices not seen during the training phase. To address\nthis challenge, we propose ZEST -- a ZSL (zero-shot learning) framework based\non self-attention for classifying both seen and unseen devices. ZEST consists\nof i) a self-attention based network feature extractor, termed SANE, for\nextracting latent space representations of IoT traffic, ii) a generative model\nthat trains a decoder using latent features to generate pseudo data, and iii) a\nsupervised model that is trained on the generated pseudo data for classifying\ndevices. 
We carry out extensive experiments on real IoT traffic data; our\nexperiments demonstrate i) ZEST achieves significant improvement (in terms of\naccuracy) over the baselines; ii) ZEST is able to better extract meaningful\nrepresentations than LSTM which has been commonly used for modeling network\ntraffic.\n","authors":["Binghui Wu","Philipp Gysel","Dinil Mon Divakaran","Mohan Gurusamy"],"pdf_url":"https://arxiv.org/pdf/2310.08036v1.pdf","comment":"9 pages, 6 figures, 3 tables"},{"id":"http://arxiv.org/abs/2306.03410v2","updated":"2023-10-12T04:57:23Z","published":"2023-06-06T05:17:02Z","title":"Learning to Simulate Tree-Branch Dynamics for Manipulation","summary":" We propose to use a simulation driven inverse inference approach to model the\ndynamics of tree branches under manipulation. Learning branch dynamics and\ngaining the ability to manipulate deformable vegetation can help with\nocclusion-prone tasks, such as fruit picking in dense foliage, as well as\nmoving overhanging vines and branches for navigation in dense vegetation. The\nunderlying deformable tree geometry is encapsulated as coarse spring\nabstractions executed on parallel, non-differentiable simulators. The implicit\nstatistical model defined by the simulator, reference trajectories obtained by\nactively probing the ground truth, and the Bayesian formalism, together guide\nthe spring parameter posterior density estimation. Our non-parametric inference\nalgorithm, based on Stein Variational Gradient Descent, incorporates\nbiologically motivated assumptions into the inference process as neural network\ndriven learnt joint priors; moreover, it leverages the finite difference scheme\nfor gradient approximations. Real and simulated experiments confirm that our\nmodel can predict deformation trajectories, quantify the estimation\nuncertainty, and it can perform better when base-lined against other inference\nalgorithms, particularly from the Monte Carlo family. The model displays strong\nrobustness properties in the presence of heteroscedastic sensor noise;\nfurthermore, it can generalise to unseen grasp locations.\n","authors":["Jayadeep Jacob","Tirthankar Bandyopadhyay","Jason Williams","Paulo Borges","Fabio Ramos"],"pdf_url":"https://arxiv.org/pdf/2306.03410v2.pdf","comment":"8 pages, 7 figures"},{"id":"http://arxiv.org/abs/2310.08031v1","updated":"2023-10-12T04:37:15Z","published":"2023-10-12T04:37:15Z","title":"Local Graph Clustering with Noisy Labels","summary":" The growing interest in machine learning problems over graphs with additional\nnode information such as texts, images, or labels has popularized methods that\nrequire the costly operation of processing the entire graph. Yet, little effort\nhas been made to the development of fast local methods (i.e. without accessing\nthe entire graph) that extract useful information from such data. To that end,\nwe propose a study of local graph clustering using noisy node labels as a proxy\nfor additional node information. In this setting, nodes receive initial binary\nlabels based on cluster affiliation: 1 if they belong to the target cluster and\n0 otherwise. Subsequently, a fraction of these labels is flipped. We\ninvestigate the benefits of incorporating noisy labels for local graph\nclustering. By constructing a weighted graph with such labels, we study the\nperformance of graph diffusion-based local clustering method on both the\noriginal and the weighted graphs. 
From a theoretical perspective, we consider\nrecovering an unknown target cluster with a single seed node in a random graph\nwith independent noisy node labels. We provide sufficient conditions on the\nlabel noise under which, with high probability, using diffusion in the weighted\ngraph yields a more accurate recovery of the target cluster. This approach\nproves more effective than using the given labels alone or using diffusion in\nthe label-free original graph. Empirically, we show that reliable node labels\ncan be obtained with just a few samples from an attributed graph. Moreover,\nutilizing these labels via diffusion in the weighted graph leads to\nsignificantly better local clustering performance across several real-world\ndatasets, improving F1 scores by up to 13%.\n","authors":["Artur Back de Luca","Kimon Fountoulakis","Shenghao Yang"],"pdf_url":"https://arxiv.org/pdf/2310.08031v1.pdf","comment":"26 pages, 5 figures, 14 tables"},{"id":"http://arxiv.org/abs/2309.10691v2","updated":"2023-10-12T04:07:56Z","published":"2023-09-19T15:25:42Z","title":"MINT: Evaluating LLMs in Multi-turn Interaction with Tools and Language\n Feedback","summary":" To solve complex tasks, large language models (LLMs) often require multiple\nrounds of interactions with the user, sometimes assisted by external tools.\nHowever, current evaluation protocols often emphasize benchmark performance\nwith single-turn exchanges, neglecting the nuanced interactions among the user,\nLLMs, and external tools, while also underestimating the importance of natural\nlanguage feedback from users. These oversights contribute to discrepancies\nbetween research benchmark evaluations and real-world use cases. We introduce\nMINT, a benchmark that evaluates LLMs' ability to solve tasks with multi-turn\ninteractions by (1) using tools and (2) leveraging natural language feedback.\nTo ensure reproducibility, we provide an evaluation framework where LLMs can\naccess tools by executing Python code and receive users' natural language\nfeedback simulated by GPT-4. We repurpose a diverse set of established\nevaluation datasets focusing on reasoning, coding, and decision-making and\ncarefully curate them into a compact subset for efficient evaluation. Our\nanalysis of 20 open- and closed-source LLMs offers intriguing findings. (a)\nLLMs generally benefit from tools and language feedback, with performance gains\n(absolute, same below) of 1-8% for each turn of tool use and 2-17% with natural\nlanguage feedback. (b) Better single-turn performance does not guarantee better\nmulti-turn performance. (c) Surprisingly, on the LLMs evaluated, supervised\ninstruction-finetuning (SIFT) and reinforcement learning from human feedback\n(RLHF) generally hurt multi-turn capabilities. 
We expect MINT can help measure\nprogress and incentivize research in improving LLMs' capabilities in multi-turn\ninteractions, especially for open-source communities where multi-turn human\nevaluation can be less accessible compared to commercial LLMs with a larger\nuser base.\n","authors":["Xingyao Wang","Zihan Wang","Jiateng Liu","Yangyi Chen","Lifan Yuan","Hao Peng","Heng Ji"],"pdf_url":"https://arxiv.org/pdf/2309.10691v2.pdf","comment":"Code is available on our project website:\n https://xingyaoww.github.io/mint-bench"},{"id":"http://arxiv.org/abs/2307.02484v5","updated":"2023-10-12T04:06:48Z","published":"2023-07-05T17:58:21Z","title":"Elastic Decision Transformer","summary":" This paper introduces Elastic Decision Transformer (EDT), a significant\nadvancement over the existing Decision Transformer (DT) and its variants.\nAlthough DT purports to generate an optimal trajectory, empirical evidence\nsuggests it struggles with trajectory stitching, a process involving the\ngeneration of an optimal or near-optimal trajectory from the best parts of a\nset of sub-optimal trajectories. The proposed EDT differentiates itself by\nfacilitating trajectory stitching during action inference at test time,\nachieved by adjusting the history length maintained in DT. Further, the EDT\noptimizes the trajectory by retaining a longer history when the previous\ntrajectory is optimal and a shorter one when it is sub-optimal, enabling it to\n\"stitch\" with a more optimal trajectory. Extensive experimentation demonstrates\nEDT's ability to bridge the performance gap between DT-based and Q\nLearning-based approaches. In particular, the EDT outperforms Q Learning-based\nmethods in a multi-task regime on the D4RL locomotion benchmark and Atari\ngames. Videos are available at: https://kristery.github.io/edt/\n","authors":["Yueh-Hua Wu","Xiaolong Wang","Masashi Hamaya"],"pdf_url":"https://arxiv.org/pdf/2307.02484v5.pdf","comment":"Accepted to NeurIPS 2023"},{"id":"http://arxiv.org/abs/2210.15889v4","updated":"2023-10-12T04:05:41Z","published":"2022-10-28T04:38:10Z","title":"Towards Data-and Knowledge-Driven Artificial Intelligence: A Survey on\n Neuro-Symbolic Computing","summary":" Neural-symbolic computing (NeSy), which pursues the integration of the\nsymbolic and statistical paradigms of cognition, has been an active research\narea of Artificial Intelligence (AI) for many years. As NeSy shows promise of\nreconciling the advantages of reasoning and interpretability of symbolic\nrepresentation and robust learning in neural networks, it may serve as a\ncatalyst for the next generation of AI. In the present paper, we provide a\nsystematic overview of the recent developments and important contributions of\nNeSy research. Firstly, we introduce study history of this area, covering early\nwork and foundations. We further discuss background concepts and identify key\ndriving factors behind the development of NeSy. Afterward, we categorize recent\nlandmark approaches along several main characteristics that underline this\nresearch paradigm, including neural-symbolic integration, knowledge\nrepresentation, knowledge embedding, and functionality. Next, we briefly\ndiscuss the successful application of modern NeSy approaches in several\ndomains. Then, we benchmark several NeSy methods on three representative\napplication tasks. Finally, we identify the open problems together with\npotential future research directions. 
This survey is expected to help new\nresearchers enter this rapidly evolving field and accelerate the progress\ntowards data-and knowledge-driven AI.\n","authors":["Wenguan Wang","Yi Yang","Fei Wu"],"pdf_url":"https://arxiv.org/pdf/2210.15889v4.pdf","comment":"Ongoing project"},{"id":"http://arxiv.org/abs/2302.02931v2","updated":"2023-10-12T03:47:00Z","published":"2023-02-06T17:07:16Z","title":"Bitrate-Constrained DRO: Beyond Worst Case Robustness To Unknown Group\n Shifts","summary":" Training machine learning models robust to distribution shifts is critical\nfor real-world applications. Some robust training algorithms (e.g., Group DRO)\nspecialize to group shifts and require group information on all training\npoints. Other methods (e.g., CVaR DRO) that do not need group annotations can\nbe overly conservative, since they naively upweight high loss points which may\nform a contrived set that does not correspond to any meaningful group in the\nreal world (e.g., when the high loss points are randomly mislabeled training\npoints). In this work, we address limitations in prior approaches by assuming a\nmore nuanced form of group shift: conditioned on the label, we assume that the\ntrue group function (indicator over group) is simple. For example, we may\nexpect that group shifts occur along low bitrate features (e.g., image\nbackground, lighting). Thus, we aim to learn a model that maintains high\naccuracy on simple group functions realized by these low bitrate features, that\nneed not spend valuable model capacity achieving high accuracy on contrived\ngroups of examples. Based on this, we consider the two-player game formulation\nof DRO where the adversary's capacity is bitrate-constrained. Our resulting\npractical algorithm, Bitrate-Constrained DRO (BR-DRO), does not require group\ninformation on training samples yet matches the performance of Group DRO on\ndatasets that have training group annotations and that of CVaR DRO on\nlong-tailed distributions. Our theoretical analysis reveals that in some\nsettings BR-DRO objective can provably yield statistically efficient and less\nconservative solutions than unconstrained CVaR DRO.\n","authors":["Amrith Setlur","Don Dennis","Benjamin Eysenbach","Aditi Raghunathan","Chelsea Finn","Virginia Smith","Sergey Levine"],"pdf_url":"https://arxiv.org/pdf/2302.02931v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08019v1","updated":"2023-10-12T03:41:32Z","published":"2023-10-12T03:41:32Z","title":"Robust 1-bit Compressed Sensing with Iterative Hard Thresholding","summary":" In 1-bit compressed sensing, the aim is to estimate a $k$-sparse unit vector\n$x\\in S^{n-1}$ within an $\\epsilon$ error (in $\\ell_2$) from minimal number of\nlinear measurements that are quantized to just their signs, i.e., from\nmeasurements of the form $y = \\mathrm{Sign}(\\langle a, x\\rangle).$ In this\npaper, we study a noisy version where a fraction of the measurements can be\nflipped, potentially by an adversary. In particular, we analyze the Binary\nIterative Hard Thresholding (BIHT) algorithm, a proximal gradient descent on a\nproperly defined loss function used for 1-bit compressed sensing, in this noisy\nsetting. It is known from recent results that, with\n$\\tilde{O}(\\frac{k}{\\epsilon})$ noiseless measurements, BIHT provides an\nestimate within $\\epsilon$ error. This result is optimal and universal, meaning\none set of measurements work for all sparse vectors. 
In this paper, we show\nthat BIHT also provides better results than all known methods for the noisy\nsetting. We show that when up to $\\tau$-fraction of the sign measurements are\nincorrect (adversarial error), with the same number of measurements as before,\nBIHT agnostically provides an estimate of $x$ within an\n$\\tilde{O}(\\epsilon+\\tau)$ error, maintaining the universality of measurements.\nThis establishes stability of iterative hard thresholding in the presence of\nmeasurement error. To obtain the result, we use the restricted approximate\ninvertibility of Gaussian matrices, as well as a tight analysis of the\nhigh-dimensional geometry of the adversarially corrupted measurements.\n","authors":["Namiko Matsumoto","Arya Mazumdar"],"pdf_url":"https://arxiv.org/pdf/2310.08019v1.pdf","comment":"Accepted to appear in ACM-SIAM Symposium on Discrete Algorithms\n (SODA) 2024"},{"id":"http://arxiv.org/abs/2308.03807v2","updated":"2023-10-12T03:36:17Z","published":"2023-08-06T15:47:03Z","title":"Nest-DGIL: Nesterov-optimized Deep Geometric Incremental Learning for CS\n Image Reconstruction","summary":" Proximal gradient-based optimization is one of the most common strategies to\nsolve inverse problem of images, and it is easy to implement. However, these\ntechniques often generate heavy artifacts in image reconstruction. One of the\nmost popular refinement methods is to fine-tune the regularization parameter to\nalleviate such artifacts, but it may not always be sufficient or applicable due\nto increased computational costs. In this work, we propose a deep geometric\nincremental learning framework based on the second Nesterov proximal gradient\noptimization. The proposed end-to-end network not only has the powerful\nlearning ability for high-/low-frequency image features, but also can\ntheoretically guarantee that geometric texture details will be reconstructed\nfrom preliminary linear reconstruction. Furthermore, it can avoid the risk of\nintermediate reconstruction results falling outside the geometric decomposition\ndomains and achieve fast convergence. Our reconstruction framework is\ndecomposed into four modules including general linear reconstruction, cascade\ngeometric incremental restoration, Nesterov acceleration, and post-processing.\nIn the image restoration step, a cascade geometric incremental learning module\nis designed to compensate for missing texture information from different\ngeometric spectral decomposition domains. Inspired by the overlap-tile\nstrategy, we also develop a post-processing module to remove the block effect\nin patch-wise-based natural image reconstruction. All parameters in the\nproposed model are learnable, an adaptive initialization technique of physical\nparameters is also employed to make model flexibility and ensure converging\nsmoothly. We compare the reconstruction performance of the proposed method with\nexisting state-of-the-art methods to demonstrate its superiority. 
Our source\ncodes are available at https://github.com/fanxiaohong/Nest-DGIL.\n","authors":["Xiaohong Fan","Yin Yang","Ke Chen","Yujie Feng","Jianping Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.03807v2.pdf","comment":"15 pages, our source codes are available at\n https://github.com/fanxiaohong/Nest-DGIL"},{"id":"http://arxiv.org/abs/2310.07644v2","updated":"2023-10-12T03:32:32Z","published":"2023-10-11T16:40:57Z","title":"Rethinking the BERT-like Pretraining for DNA Sequences","summary":" With the success of large-scale pretraining in NLP, there is an increasing\ntrend of applying it to the domain of life sciences. In particular, pretraining\nmethods based on DNA sequences have garnered growing attention due to their\npotential to capture generic information about genes. However, existing\npretraining methods for DNA sequences largely rely on direct adoptions of BERT\npretraining from NLP, lacking a comprehensive understanding and a specifically\ntailored approach. To address this research gap, we first conducted a series of\nexploratory experiments and gained several insightful observations: 1) In the\nfine-tuning phase of downstream tasks, when using K-mer overlapping\ntokenization instead of K-mer non-overlapping tokenization, both overlapping\nand non-overlapping pretraining weights show consistent performance\nimprovement. 2) During the pre-training process, using K-mer overlapping\ntokenization quickly produces clear K-mer embeddings and reduces the loss to a\nvery low level, while using K-mer non-overlapping tokenization results in less\ndistinct embeddings and continuously decreases the loss. 3) Using overlapping\ntokenization causes the self-attention in the intermediate layers of\npre-trained models to tend to overly focus on certain tokens, reflecting that\nthese layers are not adequately optimized. In summary, overlapping tokenization\ncan benefit the fine-tuning of downstream tasks but leads to inadequate\npretraining with fast convergence. To unleash the pretraining potential, we\nintroduce a novel approach called RandomMask, which gradually increases the\ntask difficulty of BERT-like pretraining by continuously expanding its mask\nboundary, forcing the model to learn more knowledge. RandomMask is simple but\neffective, achieving top-tier performance on 26 of 28 datasets\nspanning 7 downstream tasks.\n","authors":["Chaoqi Liang","Weiqiang Bai","Lifeng Qiao","Yuchen Ren","Jianle Sun","Peng Ye","Hongliang Yan","Xinzhu Ma","Wangmeng Zuo","Wanli Ouyang"],"pdf_url":"https://arxiv.org/pdf/2310.07644v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08015v1","updated":"2023-10-12T03:29:53Z","published":"2023-10-12T03:29:53Z","title":"Why Train More? Effective and Efficient Membership Inference via\n Memorization","summary":" Membership Inference Attacks (MIAs) aim to identify specific data samples\nwithin the private training dataset of machine learning models, leading to\nserious privacy violations and other sophisticated threats. Many practical\nblack-box MIAs require query access to the data distribution (the same\ndistribution where the private data is drawn) to train shadow models. By doing\nso, the adversary obtains models trained \"with\" or \"without\" samples drawn from\nthe distribution, and analyzes the characteristics of the samples under\nconsideration. The adversary is often required to train more than hundreds of\nshadow models to extract the signals needed for MIAs; this becomes the\ncomputational overhead of MIAs. 
In this paper, we propose that by strategically\nchoosing the samples, MI adversaries can maximize their attack success while\nminimizing the number of shadow models. First, our motivational experiments\nsuggest memorization as the key property explaining disparate sample\nvulnerability to MIAs. We formalize this through a theoretical bound that\nconnects MI advantage with memorization. Second, we show sample complexity\nbounds that connect the number of shadow models needed for MIAs with\nmemorization. Lastly, we confirm our theoretical arguments with comprehensive\nexperiments; by utilizing samples with high memorization scores, the adversary\ncan (a) significantly improve its efficacy regardless of the MIA used, and (b)\nreduce the number of shadow models by nearly two orders of magnitude compared\nto state-of-the-art approaches.\n","authors":["Jihye Choi","Shruti Tople","Varun Chandrasekaran","Somesh Jha"],"pdf_url":"https://arxiv.org/pdf/2310.08015v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08012v1","updated":"2023-10-12T03:28:14Z","published":"2023-10-12T03:28:14Z","title":"AutoFHE: Automated Adaption of CNNs for Efficient Evaluation over FHE","summary":" Secure inference of deep convolutional neural networks (CNNs) under RNS-CKKS\ninvolves polynomial approximation of unsupported non-linear activation\nfunctions. However, existing approaches have three main limitations: 1)\nInflexibility: The polynomial approximation and associated homomorphic\nevaluation architecture are customized manually for each CNN architecture and\ndo not generalize to other networks. 2) Suboptimal Approximation: Each\nactivation function is approximated instead of the function represented by the\nCNN. 3) Restricted Design: Either high-degree or low-degree polynomial\napproximations are used. The former retains high accuracy but slows down\ninference due to bootstrapping operations, while the latter accelerates\nciphertext inference but compromises accuracy. To address these limitations, we\npresent AutoFHE, which automatically adapts standard CNNs for secure inference\nunder RNS-CKKS. The key idea is to adopt layerwise mixed-degree polynomial\nactivation functions, which are optimized jointly with the homomorphic\nevaluation architecture in terms of the placement of bootstrapping operations.\nThe problem is modeled within a multi-objective optimization framework to\nmaximize accuracy and minimize the number of bootstrapping operations. AutoFHE\ncan be applied flexibly on any CNN architecture, and it provides diverse\nsolutions that span the trade-off between accuracy and latency. Experimental\nevaluation over RNS-CKKS encrypted CIFAR datasets shows that AutoFHE\naccelerates secure inference by $1.32\\times$ to $1.8\\times$ compared to methods\nemploying high-degree polynomials. It also improves accuracy by up to 2.56%\ncompared to methods using low-degree polynomials. Lastly, AutoFHE accelerates\ninference and improves accuracy by $103\\times$ and 3.46%, respectively,\ncompared to CNNs under TFHE.\n","authors":["Wei Ao","Vishnu Naresh Boddeti"],"pdf_url":"https://arxiv.org/pdf/2310.08012v1.pdf","comment":"USENIX Security Symposium 2024"},{"id":"http://arxiv.org/abs/2310.06763v2","updated":"2023-10-12T03:24:43Z","published":"2023-10-10T16:39:47Z","title":"FABind: Fast and Accurate Protein-Ligand Binding","summary":" Modeling the interaction between proteins and ligands and accurately\npredicting their binding structures is a critical yet challenging task in drug\ndiscovery. 
Recent advancements in deep learning have shown promise in\naddressing this challenge, with sampling-based and regression-based methods\nemerging as two prominent approaches. However, these methods have notable\nlimitations. Sampling-based methods often suffer from low efficiency due to the\nneed for generating multiple candidate structures for selection. On the other\nhand, regression-based methods offer fast predictions but may experience\ndecreased accuracy. Additionally, the variation in protein sizes often requires\nexternal modules for selecting suitable binding pockets, further impacting\nefficiency. In this work, we propose $\\mathbf{FABind}$, an end-to-end model\nthat combines pocket prediction and docking to achieve accurate and fast\nprotein-ligand binding. $\\mathbf{FABind}$ incorporates a unique ligand-informed\npocket prediction module, which is also leveraged for docking pose estimation.\nThe model further enhances the docking process by incrementally integrating the\npredicted pocket to optimize protein-ligand binding, reducing discrepancies\nbetween training and inference. Through extensive experiments on benchmark\ndatasets, our proposed $\\mathbf{FABind}$ demonstrates strong advantages in\nterms of effectiveness and efficiency compared to existing methods. Our code is\navailable at $\\href{https://github.com/QizhiPei/FABind}{Github}$.\n","authors":["Qizhi Pei","Kaiyuan Gao","Lijun Wu","Jinhua Zhu","Yingce Xia","Shufang Xie","Tao Qin","Kun He","Tie-Yan Liu","Rui Yan"],"pdf_url":"https://arxiv.org/pdf/2310.06763v2.pdf","comment":"Neural Information Processing Systems (NeurIPS 2023)"},{"id":"http://arxiv.org/abs/2310.06488v2","updated":"2023-10-12T03:23:40Z","published":"2023-10-10T09:57:17Z","title":"SpikeCLIP: A Contrastive Language-Image Pretrained Spiking Neural\n Network","summary":" Spiking neural networks (SNNs) have demonstrated the capability to achieve\ncomparable performance to deep neural networks (DNNs) in both visual and\nlinguistic domains while offering the advantages of improved energy efficiency\nand adherence to biological plausibility. However, the extension of such\nsingle-modality SNNs into the realm of multimodal scenarios remains an\nunexplored territory. Drawing inspiration from the concept of contrastive\nlanguage-image pre-training (CLIP), we introduce a novel framework, named\nSpikeCLIP, to address the gap between two modalities within the context of\nspike-based computing through a two-step recipe involving ``Alignment\nPre-training + Dual-Loss Fine-tuning\". Extensive experiments demonstrate that\nSNNs achieve comparable results to their DNN counterparts while significantly\nreducing energy consumption across a variety of datasets commonly used for\nmultimodal model evaluation. Furthermore, SpikeCLIP maintains robust\nperformance in image classification tasks that involve class labels not\npredefined within specific categories.\n","authors":["Tianlong Li","Wenhao Liu","Changze Lv","Jianhan Xu","Cenyuan Zhang","Muling Wu","Xiaoqing Zheng","Xuanjing Huang"],"pdf_url":"https://arxiv.org/pdf/2310.06488v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02285v2","updated":"2023-10-12T03:05:36Z","published":"2023-09-05T14:45:27Z","title":"PromptTTS 2: Describing and Generating Voices with Text Prompt","summary":" Speech conveys more information than text, as the same word can be uttered in\nvarious voices to convey diverse information. 
Compared to traditional\ntext-to-speech (TTS) methods relying on speech prompts (reference speech) for\nvoice variability, using text prompts (descriptions) is more user-friendly\nsince speech prompts can be hard to find or may not exist at all. TTS\napproaches based on the text prompt face two main challenges: 1) the\none-to-many problem, where not all details about voice variability can be\ndescribed in the text prompt, and 2) the limited availability of text prompt\ndatasets, where vendors and large cost of data labeling are required to write\ntext prompts for speech. In this work, we introduce PromptTTS 2 to address\nthese challenges with a variation network to provide variability information of\nvoice not captured by text prompts, and a prompt generation pipeline to utilize\nthe large language models (LLM) to compose high quality text prompts.\nSpecifically, the variation network predicts the representation extracted from\nthe reference speech (which contains full information about voice variability)\nbased on the text prompt representation. For the prompt generation pipeline, it\ngenerates text prompts for speech with a speech language understanding model to\nrecognize voice attributes (e.g., gender, speed) from speech and a large\nlanguage model to formulate text prompts based on the recognition results.\nExperiments on a large-scale (44K hours) speech dataset demonstrate that\ncompared to the previous works, PromptTTS 2 generates voices more consistent\nwith text prompts and supports the sampling of diverse voice variability,\nthereby offering users more choices on voice generation. Additionally, the\nprompt generation pipeline produces high-quality text prompts, eliminating the\nlarge labeling cost. The demo page of PromptTTS 2 is available online.\n","authors":["Yichong Leng","Zhifang Guo","Kai Shen","Xu Tan","Zeqian Ju","Yanqing Liu","Yufei Liu","Dongchao Yang","Leying Zhang","Kaitao Song","Lei He","Xiang-Yang Li","Sheng Zhao","Tao Qin","Jiang Bian"],"pdf_url":"https://arxiv.org/pdf/2309.02285v2.pdf","comment":"Demo page: https://speechresearch.github.io/prompttts2"},{"id":"http://arxiv.org/abs/2309.09408v2","updated":"2023-10-12T23:55:38Z","published":"2023-09-18T00:22:59Z","title":"Guided Online Distillation: Promoting Safe Reinforcement Learning by\n Offline Demonstration","summary":" Safe Reinforcement Learning (RL) aims to find a policy that achieves high\nrewards while satisfying cost constraints. When learning from scratch, safe RL\nagents tend to be overly conservative, which impedes exploration and restrains\nthe overall performance. In many realistic tasks, e.g. autonomous driving,\nlarge-scale expert demonstration data are available. We argue that extracting\nexpert policy from offline data to guide online exploration is a promising\nsolution to mitigate the conserveness issue. Large-capacity models, e.g.\ndecision transformers (DT), have been proven to be competent in offline policy\nlearning. However, data collected in real-world scenarios rarely contain\ndangerous cases (e.g., collisions), which makes it prohibitive for the policies\nto learn safety concepts. Besides, these bulk policy networks cannot meet the\ncomputation speed requirements at inference time on real-world tasks such as\nautonomous driving. To this end, we propose Guided Online Distillation (GOLD),\nan offline-to-online safe RL framework. 
GOLD distills an offline DT policy into\na lightweight policy network through guided online safe RL training, which\noutperforms both the offline DT policy and online safe RL algorithms.\nExperiments in both benchmark safe RL tasks and real-world driving tasks based\non the Waymo Open Motion Dataset (WOMD) demonstrate that GOLD can successfully\ndistill lightweight policies and solve decision-making problems in challenging\nsafety-critical scenarios.\n","authors":["Jinning Li","Xinyi Liu","Banghua Zhu","Jiantao Jiao","Masayoshi Tomizuka","Chen Tang","Wei Zhan"],"pdf_url":"https://arxiv.org/pdf/2309.09408v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.08141v2","updated":"2023-10-12T23:50:05Z","published":"2023-01-11T22:47:12Z","title":"Self-supervised Learning for Segmentation and Quantification of Dopamine\n Neurons in Parkinson's Disease","summary":" Parkinson's Disease (PD) is the second most common neurodegenerative disease\nin humans. PD is characterized by the gradual loss of dopaminergic neurons in\nthe Substantia Nigra (SN). Counting the number of dopaminergic neurons in the\nSN is one of the most important indexes in evaluating drug efficacy in PD\nanimal models. Currently, analyzing and quantifying dopaminergic neurons is\nconducted manually by experts through analysis of digital pathology images\nwhich is laborious, time-consuming, and highly subjective. As such, a reliable\nand unbiased automated system is demanded for the quantification of\ndopaminergic neurons in digital pathology images. Recent years have seen a\nsurge in adopting deep learning solutions in medical image processing. However,\ndeveloping high-performing deep learning models hinges on the availability of\nlarge-scale, high-quality annotated data, which can be expensive to acquire,\nespecially in applications like digital pathology image analysis. To this end,\nwe propose an end-to-end deep learning framework based on self-supervised\nlearning for the segmentation and quantification of dopaminergic neurons in PD\nanimal models. To the best of our knowledge, this is the first deep learning\nmodel that detects the cell body of dopaminergic neurons, counts the number of\ndopaminergic neurons, and provides characteristics of individual dopaminergic\nneurons as a numerical output. Extensive experiments demonstrate the\neffectiveness of our model in quantifying neurons with high precision, which\ncan provide a faster turnaround for drug efficacy studies, better understanding\nof dopaminergic neuronal health status, and unbiased results in PD pre-clinical\nresearch. As part of our contributions, we also provide the first publicly\navailable dataset of histology digital images along with expert annotations for\nthe segmentation of TH-positive DA neuronal soma.\n","authors":["Fatemeh Haghighi","Soumitra Ghosh","Hai Ngu","Sarah Chu","Han Lin","Mohsen Hejrati","Baris Bingol","Somaye Hashemifar"],"pdf_url":"https://arxiv.org/pdf/2301.08141v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08775v1","updated":"2023-10-12T23:47:22Z","published":"2023-10-12T23:47:22Z","title":"When Machine Learning Models Leak: An Exploration of Synthetic Training\n Data","summary":" We investigate an attack on a machine learning model that predicts whether a\nperson or household will relocate in the next two years, i.e., a\npropensity-to-move classifier. 
The attack assumes that the attacker can query\nthe model to obtain predictions and that the marginal distribution of the data\non which the model was trained is publicly available. The attack also assumes\nthat the attacker has obtained the values of non-sensitive attributes for a\ncertain number of target individuals. The objective of the attack is to infer\nthe values of sensitive attributes for these target individuals. We explore how\nreplacing the original data with synthetic data when training the model impacts\nhow successfully the attacker can infer sensitive attributes.\\footnote{Original\npaper published at PSD 2022. The paper was subsequently updated.}\n","authors":["Manel Slokom","Peter-Paul de Wolf","Martha Larson"],"pdf_url":"https://arxiv.org/pdf/2310.08775v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08774v1","updated":"2023-10-12T23:46:08Z","published":"2023-10-12T23:46:08Z","title":"PhyloGFN: Phylogenetic inference with generative flow networks","summary":" Phylogenetics is a branch of computational biology that studies the\nevolutionary relationships among biological entities. Its long history and\nnumerous applications notwithstanding, inference of phylogenetic trees from\nsequence data remains challenging: the high complexity of tree space poses a\nsignificant obstacle for the current combinatorial and probabilistic\ntechniques. In this paper, we adopt the framework of generative flow networks\n(GFlowNets) to tackle two core problems in phylogenetics: parsimony-based and\nBayesian phylogenetic inference. Because GFlowNets are well-suited for sampling\ncomplex combinatorial structures, they are a natural choice for exploring and\nsampling from the multimodal posterior distribution over tree topologies and\nevolutionary distances. We demonstrate that our amortized posterior sampler,\nPhyloGFN, produces diverse and high-quality evolutionary hypotheses on real\nbenchmark datasets. PhyloGFN is competitive with prior works in marginal\nlikelihood estimation and achieves a closer fit to the target distribution than\nstate-of-the-art variational inference methods.\n","authors":["Mingyang Zhou","Zichao Yan","Elliot Layne","Nikolay Malkin","Dinghuai Zhang","Moksh Jain","Mathieu Blanchette","Yoshua Bengio"],"pdf_url":"https://arxiv.org/pdf/2310.08774v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.00489v2","updated":"2023-10-12T23:37:01Z","published":"2023-09-30T20:59:42Z","title":"Dynamic DAG Discovery for Interpretable Imitation Learning","summary":" Imitation learning, which learns agent policy by mimicking expert\ndemonstration, has shown promising results in many applications such as medical\ntreatment regimes and self-driving vehicles. However, it remains a difficult\ntask to interpret control policies learned by the agent. Difficulties mainly\ncome from two aspects: 1) agents in imitation learning are usually implemented\nas deep neural networks, which are black-box models and lack interpretability;\n2) the latent causal mechanism behind agents' decisions may vary along the\ntrajectory, rather than staying static throughout time steps. To increase\ntransparency and offer better interpretability of the neural agent, we propose\nto expose its captured knowledge in the form of a directed acyclic causal\ngraph, with nodes being action and state variables and edges denoting the\ncausal relations behind predictions. Furthermore, we design this causal\ndiscovery process to be state-dependent, enabling it to model the dynamics in\nlatent causal graphs. 
Concretely, we conduct causal discovery from the\nperspective of Granger causality and propose a self-explainable imitation\nlearning framework, {\\method}. The proposed framework is composed of three\nparts: a dynamic causal discovery module, a causality encoding module, and a\nprediction module, and is trained in an end-to-end manner. After the model is\nlearned, we can obtain causal relations among states and action variables\nbehind its decisions, exposing policies learned by it. Experimental results on\nboth synthetic and real-world datasets demonstrate the effectiveness of the\nproposed {\\method} in learning the dynamic causal graphs for understanding the\ndecision-making of imitation learning meanwhile maintaining high prediction\naccuracy.\n","authors":["ianxiang Zhao","Wenchao Yu","Suhang Wang","Lu Wang","Xiang Zhang","Yuncong Chen","Yanchi Liu","Wei Cheng","Haifeng Chen"],"pdf_url":"https://arxiv.org/pdf/2310.00489v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08767v1","updated":"2023-10-12T23:26:44Z","published":"2023-10-12T23:26:44Z","title":"Modeling Fission Gas Release at the Mesoscale using Multiscale DenseNet\n Regression with Attention Mechanism and Inception Blocks","summary":" Mesoscale simulations of fission gas release (FGR) in nuclear fuel provide a\npowerful tool for understanding how microstructure evolution impacts FGR, but\nthey are computationally intensive. In this study, we present an alternate,\ndata-driven approach, using deep learning to predict instantaneous FGR flux\nfrom 2D nuclear fuel microstructure images. Four convolutional neural network\n(CNN) architectures with multiscale regression are trained and evaluated on\nsimulated FGR data generated using a hybrid phase field/cluster dynamics model.\nAll four networks show high predictive power, with $R^{2}$ values above 98%.\nThe best performing network combine a Convolutional Block Attention Module\n(CBAM) and InceptionNet mechanisms to provide superior accuracy (mean absolute\npercentage error of 4.4%), training stability, and robustness on very low\ninstantaneous FGR flux values.\n","authors":["Peter Toma","Md Ali Muntaha","Joel B. Harley","Michael R. Tonks"],"pdf_url":"https://arxiv.org/pdf/2310.08767v1.pdf","comment":"Submitted at Journal of Nuclear Materials, 20 pages, 10 figures, 3\n tables"},{"id":"http://arxiv.org/abs/2006.10189v4","updated":"2023-10-12T23:18:22Z","published":"2020-06-17T22:45:14Z","title":"Revisiting minimum description length complexity in overparameterized\n models","summary":" Complexity is a fundamental concept underlying statistical learning theory\nthat aims to inform generalization performance. Parameter count, while\nsuccessful in low-dimensional settings, is not well-justified for\noverparameterized settings when the number of parameters is more than the\nnumber of training samples. We revisit complexity measures based on Rissanen's\nprinciple of minimum description length (MDL) and define a novel MDL-based\ncomplexity (MDL-COMP) that remains valid for overparameterized models. MDL-COMP\nis defined via an optimality criterion over the encodings induced by a good\nRidge estimator class. We provide an extensive theoretical characterization of\nMDL-COMP for linear models and kernel methods and show that it is not just a\nfunction of parameter count, but rather a function of the singular values of\nthe design or the kernel matrix and the signal-to-noise ratio. For a linear\nmodel with $n$ observations, $d$ parameters, and i.i.d. 
Gaussian predictors,\nMDL-COMP scales linearly with $d$ when $d < n$, but grows much more slowly when\n$d > n$. For kernel methods, we show that MDL-COMP\ninforms minimax in-sample error, and can decrease as the dimensionality of the\ninput increases. We also prove that MDL-COMP upper bounds the in-sample mean\nsquared error (MSE). Via an array of simulations and real-data experiments, we\nshow that a data-driven Prac-MDL-COMP informs hyper-parameter tuning for\noptimizing test MSE with ridge regression in limited data settings, sometimes\nimproving upon cross-validation and (always) saving computational costs.\nFinally, our findings also suggest that the recently observed double descent\nphenomenon in overparameterized models might be a consequence of the choice of\nnon-ideal estimators.\n","authors":["Raaz Dwivedi","Chandan Singh","Bin Yu","Martin J. Wainwright"],"pdf_url":"https://arxiv.org/pdf/2006.10189v4.pdf","comment":"First two authors contributed equally"},{"id":"http://arxiv.org/abs/2310.08764v1","updated":"2023-10-12T23:17:56Z","published":"2023-10-12T23:17:56Z","title":"Calibrating Likelihoods towards Consistency in Summarization Models","summary":" Despite the recent advances in abstractive text summarization, current\nsummarization models still suffer from generating factually inconsistent\nsummaries, reducing their utility for real-world applications. We argue that the\nmain reason for such behavior is that the summarization models trained with\na maximum likelihood objective assign high probability to plausible sequences\ngiven the context, but they often do not accurately rank sequences by their\nconsistency. In this work, we solve this problem by calibrating the likelihood\nof model generated sequences to better align with a consistency metric measured\nby natural language inference (NLI) models. The human evaluation study and\nautomatic metrics show that the calibrated models generate more consistent and\nhigher-quality summaries. We also show that the models trained using our method\nreturn probabilities that are better aligned with the NLI scores, which\nsignificantly increases the reliability of summarization models.\n","authors":["Polina Zablotskaia","Misha Khalman","Rishabh Joshi","Livio Baldini Soares","Shoshana Jakobovits","Joshua Maynez","Shashi Narayan"],"pdf_url":"https://arxiv.org/pdf/2310.08764v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08762v1","updated":"2023-10-12T23:06:52Z","published":"2023-10-12T23:06:52Z","title":"Stabilizing Subject Transfer in EEG Classification with Divergence\n Estimation","summary":" Classification models for electroencephalogram (EEG) data show a large\ndecrease in performance when evaluated on unseen test subjects. We reduce this\nperformance decrease using new regularization techniques during model training.\nWe propose several graphical models to describe an EEG classification task.\nFrom each model, we identify statistical relationships that should hold true in\nan idealized training scenario (with infinite data and a globally-optimal\nmodel) but that may not hold in practice. We design regularization penalties to\nenforce these relationships in two stages. First, we identify suitable proxy\nquantities (divergences such as Mutual Information and Wasserstein-1) that can\nbe used to measure statistical independence and dependence relationships.\nSecond, we provide algorithms to efficiently estimate these quantities during\ntraining using secondary neural network models. 
We conduct extensive\ncomputational experiments using a large benchmark EEG dataset, comparing our\nproposed techniques with a baseline method that uses an adversarial classifier.\nWe find our proposed methods significantly increase balanced accuracy on test\nsubjects and decrease overfitting. The proposed methods exhibit a larger\nbenefit over a greater range of hyperparameters than the baseline method, with\nonly a small computational cost at training time. These benefits are largest\nwhen used for a fixed training period, though there is still a significant\nbenefit for a subset of hyperparameters when our techniques are used in\nconjunction with early stopping regularization.\n","authors":["Niklas Smedemark-Margulies","Ye Wang","Toshiaki Koike-Akino","Jing Liu","Kieran Parsons","Yunus Bicer","Deniz Erdogmus"],"pdf_url":"https://arxiv.org/pdf/2310.08762v1.pdf","comment":"16 pages, 5 figures"},{"id":"http://arxiv.org/abs/2303.06827v2","updated":"2023-10-12T23:04:25Z","published":"2023-03-13T03:00:03Z","title":"Kernel Density Bayesian Inverse Reinforcement Learning","summary":" Inverse reinforcement learning~(IRL) is a powerful framework to infer an\nagent's reward function by observing its behavior, but IRL algorithms that\nlearn point estimates of the reward function can be misleading because there\nmay be several functions that describe an agent's behavior equally well. A\nBayesian approach to IRL models a distribution over candidate reward functions,\nalleviating the shortcomings of learning a point estimate. However, several\nBayesian IRL algorithms use a $Q$-value function in place of the likelihood\nfunction. The resulting posterior is computationally intensive to calculate,\nhas few theoretical guarantees, and the $Q$-value function is often a poor\napproximation for the likelihood. We introduce kernel density Bayesian IRL\n(KD-BIRL), which uses conditional kernel density estimation to directly\napproximate the likelihood, providing an efficient framework that, with a\nmodified reward function parameterization, is applicable to environments with\ncomplex and infinite state spaces. We demonstrate KD-BIRL's benefits through a\nseries of experiments in Gridworld environments and a simulated sepsis\ntreatment task.\n","authors":["Aishwarya Mandyam","Didong Li","Diana Cai","Andrew Jones","Barbara E. Engelhardt"],"pdf_url":"https://arxiv.org/pdf/2303.06827v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08759v1","updated":"2023-10-12T22:56:53Z","published":"2023-10-12T22:56:53Z","title":"Question Answering for Electronic Health Records: A Scoping Review of\n datasets and models","summary":" Question Answering (QA) systems on patient-related data can assist both\nclinicians and patients. They can, for example, assist clinicians in\ndecision-making and enable patients to have a better understanding of their\nmedical history. Significant amounts of patient data are stored in Electronic\nHealth Records (EHRs), making EHR QA an important research area. In EHR QA, the\nanswer is obtained from the medical record of the patient. Because of the\ndifferences in data format and modality, this differs greatly from other\nmedical QA tasks that employ medical websites or scientific papers to retrieve\nanswers, making it critical to research EHR question answering. This study\naimed to provide a methodological review of existing works on QA over EHRs. 
We\nsearched for articles from January 1st, 2005 to September 30th, 2023 in four\ndigital sources including Google Scholar, ACL Anthology, ACM Digital Library,\nand PubMed to collect relevant publications on EHR QA. 4111 papers were\nidentified for our study, and after screening based on our inclusion criteria,\nwe obtained a total of 47 papers for further study. Out of the 47 papers, 25\npapers were about EHR QA datasets, and 37 papers were about EHR QA models. It\nwas observed that QA on EHRs is relatively new and unexplored. Most of the\nworks are fairly recent. Also, it was observed that emrQA is by far the most\npopular EHR QA dataset, both in terms of citations and usage in other papers.\nFurthermore, we identified the different models used in EHR QA along with the\nevaluation metrics used for these models.\n","authors":["Jayetri Bardhan","Kirk Roberts","Daisy Zhe Wang"],"pdf_url":"https://arxiv.org/pdf/2310.08759v1.pdf","comment":"5 tables, 6 figures"},{"id":"http://arxiv.org/abs/2303.06171v4","updated":"2023-10-12T22:55:43Z","published":"2023-03-10T19:14:20Z","title":"DP-Fast MH: Private, Fast, and Accurate Metropolis-Hastings for\n Large-Scale Bayesian Inference","summary":" Bayesian inference provides a principled framework for learning from complex\ndata and reasoning under uncertainty. It has been widely applied in machine\nlearning tasks such as medical diagnosis, drug design, and policymaking. In\nthese common applications, data can be highly sensitive. Differential privacy\n(DP) offers data analysis tools with powerful worst-case privacy guarantees and\nhas been developed as the leading approach in privacy-preserving data analysis.\nIn this paper, we study Metropolis-Hastings (MH), one of the most fundamental\nMCMC methods, for large-scale Bayesian inference under differential privacy.\nWhile most existing private MCMC algorithms sacrifice accuracy and efficiency\nto obtain privacy, we provide the first exact and fast DP MH algorithm, using\nonly a minibatch of data in most iterations. We further reveal, for the first\ntime, a three-way trade-off among privacy, scalability (i.e. the batch size),\nand efficiency (i.e. the convergence rate), theoretically characterizing how\nprivacy affects the utility and computational cost in Bayesian inference. We\nempirically demonstrate the effectiveness and efficiency of our algorithm in\nvarious experiments.\n","authors":["Wanrong Zhang","Ruqi Zhang"],"pdf_url":"https://arxiv.org/pdf/2303.06171v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08757v1","updated":"2023-10-12T22:52:29Z","published":"2023-10-12T22:52:29Z","title":"Detection and prediction of clopidogrel treatment failures using\n longitudinal structured electronic health records","summary":" We propose machine learning algorithms to automatically detect and predict\nclopidogrel treatment failure using longitudinal structured electronic health\nrecords (EHR). By drawing analogies between natural language and structured\nEHR, we introduce various machine learning algorithms used in natural language\nprocessing (NLP) applications to build models for treatment failure detection\nand prediction. In this regard, we generated a cohort of patients with\nclopidogrel prescriptions from UK Biobank and annotated if the patients had\ntreatment failure events within one year of the first clopidogrel prescription;\nout of 502,527 patients, 1,824 patients were identified as treatment failure\ncases, and 6,859 patients were considered as control cases. 
From the dataset,\nwe gathered diagnoses, prescriptions, and procedure records together per\npatient and organized them into visits with the same date to build models. The\nmodels were built for two different tasks, i.e., detection and prediction, and\nthe experimental results showed that time series models outperform bag-of-words\napproaches in both tasks. In particular, a Transformer-based model, namely\nBERT, could reach 0.928 AUC in detection tasks and 0.729 AUC in prediction\ntasks. BERT also showed competence over other time series models when there is\nnot enough training data, because it leverages the pre-training procedure using\nlarge unlabeled data.\n","authors":["Samuel Kim","In Gu Sean Lee","Mijeong Irene Ban","Jane Chiang"],"pdf_url":"https://arxiv.org/pdf/2310.08757v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.14941v2","updated":"2023-10-12T22:49:49Z","published":"2023-06-26T17:54:24Z","title":"SIMMF: Semantics-aware Interactive Multiagent Motion Forecasting for\n Autonomous Vehicle Driving","summary":" Autonomous vehicles require motion forecasting of their surrounding\nmultiagents (pedestrians and vehicles) to make optimal decisions for\nnavigation. The existing methods focus on techniques to utilize the positions\nand velocities of these agents and fail to capture semantic information from\nthe scene. Moreover, to mitigate the increase in computational complexity\nassociated with the number of agents in the scene, some works leverage\nEuclidean distance to prune far-away agents. However, distance-based metric\nalone is insufficient to select relevant agents and accurately perform their\npredictions. To resolve these issues, we propose the Semantics-aware\nInteractive Multiagent Motion Forecasting (SIMMF) method to capture semantics\nalong with spatial information and optimally select relevant agents for motion\nprediction. Specifically, we achieve this by implementing a semantic-aware\nselection of relevant agents from the scene and passing them through an\nattention mechanism to extract global encodings. These encodings along with\nagents' local information, are passed through an encoder to obtain\ntime-dependent latent variables for a motion policy predicting the future\ntrajectories. Our results show that the proposed approach outperforms\nstate-of-the-art baselines and provides more accurate and scene-consistent\npredictions.\n","authors":["Vidyaa Krishnan Nivash","Ahmed H. Qureshi"],"pdf_url":"https://arxiv.org/pdf/2306.14941v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08754v1","updated":"2023-10-12T22:44:19Z","published":"2023-10-12T22:44:19Z","title":"Tokenizer Choice For LLM Training: Negligible or Crucial?","summary":" The recent success of LLMs has been predominantly driven by curating the\ntraining dataset composition, scaling of model architectures and dataset sizes\nand advancements in pretraining objectives, leaving tokenizer influence as a\nblind spot. Shedding light on this underexplored area, we conduct a\ncomprehensive study on the influence of tokenizer choice on LLM downstream\nperformance by training 24 mono- and multilingual LLMs at a 2.6B parameter\nscale, ablating different tokenizer algorithms and parameterizations. Our\nstudies highlight that the tokenizer choice can significantly impact the\nmodel's downstream performance, training and inference costs. 
In particular, we\nfind that the common tokenizer evaluation metrics fertility and parity are not\nalways predictive of model downstream performance, rendering these metrics a\nquestionable choice for tokenizer evaluation. Furthermore, we show that\nmultilingual tokenizers trained on the five most frequent European languages\nrequire vocabulary size increases of factor three in comparison to English.\nWhile English-only tokenizers have been applied to the training of\nmulti-lingual LLMs in the past, we find that this approach results in a severe\ndownstream performance degradation and additional training costs of up to 68%,\ndue to an inefficient tokenization vocabulary.\n","authors":["Mehdi Ali","Michael Fromm","Klaudia Thellmann","Richard Rutmann","Max Lübbering","Johannes Leveling","Katrin Klug","Jan Ebert","Niclas Doll","Jasper Schulze Buschhoff","Charvi Jain","Alexander Arno Weber","Lena Jurkschat","Hammam Abdelwahab","Chelsea John","Pedro Ortiz Suarez","Malte Ostendorff","Samuel Weinbach","Rafet Sifa","Stefan Kesselheim","Nicolas Flores-Herr"],"pdf_url":"https://arxiv.org/pdf/2310.08754v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08751v1","updated":"2023-10-12T22:32:00Z","published":"2023-10-12T22:32:00Z","title":"Constrained Bayesian Optimization with Adaptive Active Learning of\n Unknown Constraints","summary":" Optimizing objectives under constraints, where both the objectives and\nconstraints are black box functions, is a common scenario in real-world\napplications such as scientific experimental design, design of medical\ntherapies, and industrial process optimization. One popular approach to\nhandling these complex scenarios is Bayesian Optimization (BO). In terms of\ntheoretical behavior, BO is relatively well understood in the unconstrained\nsetting, where its principles have been well explored and validated. However,\nwhen it comes to constrained Bayesian optimization (CBO), the existing\nframework often relies on heuristics or approximations without the same level\nof theoretical guarantees.\n In this paper, we delve into the theoretical and practical aspects of\nconstrained Bayesian optimization, where the objective and constraints can be\nindependently evaluated and are subject to noise. By recognizing that both the\nobjective and constraints can help identify high-confidence regions of interest\n(ROI), we propose an efficient CBO framework that intersects the ROIs\nidentified from each aspect to determine the general ROI. The ROI, coupled with\na novel acquisition function that adaptively balances the optimization of the\nobjective and the identification of feasible regions, enables us to derive\nrigorous theoretical justifications for its performance. We showcase the\nefficiency and robustness of our proposed CBO framework through empirical\nevidence and discuss the fundamental challenge of deriving practical regret\nbounds for CBO algorithms.\n","authors":["Fengxue Zhang","Zejie Zhu","Yuxin Chen"],"pdf_url":"https://arxiv.org/pdf/2310.08751v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08750v1","updated":"2023-10-12T22:30:15Z","published":"2023-10-12T22:30:15Z","title":"Search-Adaptor: Text Embedding Customization for Information Retrieval","summary":" Text embeddings extracted by pre-trained Large Language Models (LLMs) have\nsignificant potential to improve information retrieval and search. 
Beyond the\nzero-shot setup in which they are being conventionally used, being able to take\nadvantage of the information from the relevant query-corpus paired data has the\npower to further boost the LLM capabilities. In this paper, we propose a novel\nmethod, Search-Adaptor, for customizing LLMs for information retrieval in an\nefficient and robust way. Search-Adaptor modifies the original text embedding\ngenerated by pre-trained LLMs, and can be integrated with any LLM, including\nthose only available via APIs. On multiple real-world English and multilingual\nretrieval datasets, we show consistent and significant performance benefits for\nSearch-Adaptor -- e.g., more than 5.2% improvements over the Google Embedding\nAPIs in nDCG@10 averaged over 13 BEIR datasets.\n","authors":["Jinsung Yoon","Sercan O Arik","Yanfei Chen","Tomas Pfister"],"pdf_url":"https://arxiv.org/pdf/2310.08750v1.pdf","comment":"9 pages, 2 figures"},{"id":"http://arxiv.org/abs/2310.08748v1","updated":"2023-10-12T22:28:53Z","published":"2023-10-12T22:28:53Z","title":"Evolutionary Dynamic Optimization and Machine Learning","summary":" Evolutionary Computation (EC) has emerged as a powerful field of Artificial\nIntelligence, inspired by nature's mechanisms of gradual development. However,\nEC approaches often face challenges such as stagnation, diversity loss,\ncomputational complexity, population initialization, and premature convergence.\nTo overcome these limitations, researchers have integrated learning algorithms\nwith evolutionary techniques. This integration harnesses the valuable data\ngenerated by EC algorithms during iterative searches, providing insights into\nthe search space and population dynamics. Similarly, the relationship between\nevolutionary algorithms and Machine Learning (ML) is reciprocal, as EC methods\noffer exceptional opportunities for optimizing complex ML tasks characterized\nby noisy, inaccurate, and dynamic objective functions. These hybrid techniques,\nknown as Evolutionary Machine Learning (EML), have been applied at various\nstages of the ML process. EC techniques play a vital role in tasks such as data\nbalancing, feature selection, and model training optimization. Moreover, ML\ntasks often require dynamic optimization, for which Evolutionary Dynamic\nOptimization (EDO) is valuable. This paper presents the first comprehensive\nexploration of reciprocal integration between EDO and ML. The study aims to\nstimulate interest in the evolutionary learning community and inspire\ninnovative contributions in this domain.\n","authors":["Abdennour Boulesnane"],"pdf_url":"https://arxiv.org/pdf/2310.08748v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.06366v4","updated":"2023-10-12T22:25:43Z","published":"2022-10-12T16:18:25Z","title":"A Generalist Framework for Panoptic Segmentation of Images and Videos","summary":" Panoptic segmentation assigns semantic and instance ID labels to every pixel\nof an image. As permutations of instance IDs are also valid solutions, the task\nrequires learning of high-dimensional one-to-many mapping. As a result,\nstate-of-the-art approaches use customized architectures and task-specific loss\nfunctions. We formulate panoptic segmentation as a discrete data generation\nproblem, without relying on inductive bias of the task. A diffusion model is\nproposed to model panoptic masks, with a simple architecture and generic loss\nfunction. 
By simply adding past predictions as a conditioning signal, our\nmethod is capable of modeling video (in a streaming setting) and thereby learns\nto track object instances automatically. With extensive experiments, we\ndemonstrate that our simple approach can perform competitively to\nstate-of-the-art specialist methods in similar settings.\n","authors":["Ting Chen","Lala Li","Saurabh Saxena","Geoffrey Hinton","David J. Fleet"],"pdf_url":"https://arxiv.org/pdf/2210.06366v4.pdf","comment":"ICCV'23. Code at https://github.com/google-research/pix2seq"},{"id":"http://arxiv.org/abs/2310.08746v1","updated":"2023-10-12T22:19:36Z","published":"2023-10-12T22:19:36Z","title":"Robustness to Multi-Modal Environment Uncertainty in MARL using\n Curriculum Learning","summary":" Multi-agent reinforcement learning (MARL) plays a pivotal role in tackling\nreal-world challenges. However, the seamless transition of trained policies\nfrom simulations to real-world requires it to be robust to various\nenvironmental uncertainties. Existing works focus on finding Nash Equilibrium\nor the optimal policy under uncertainty in one environment variable (i.e.\naction, state or reward). This is because a multi-agent system itself is highly\ncomplex and unstationary. However, in real-world situation uncertainty can\noccur in multiple environment variables simultaneously. This work is the first\nto formulate the generalised problem of robustness to multi-modal environment\nuncertainty in MARL. To this end, we propose a general robust training approach\nfor multi-modal uncertainty based on curriculum learning techniques. We handle\ntwo distinct environmental uncertainty simultaneously and present extensive\nresults across both cooperative and competitive MARL environments,\ndemonstrating that our approach achieves state-of-the-art levels of robustness.\n","authors":["Aakriti Agrawal","Rohith Aralikatti","Yanchao Sun","Furong Huang"],"pdf_url":"https://arxiv.org/pdf/2310.08746v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08744v1","updated":"2023-10-12T22:12:28Z","published":"2023-10-12T22:12:28Z","title":"Circuit Component Reuse Across Tasks in Transformer Language Models","summary":" Recent work in mechanistic interpretability has shown that behaviors in\nlanguage models can be successfully reverse-engineered through circuit\nanalysis. A common criticism, however, is that each circuit is task-specific,\nand thus such analysis cannot contribute to understanding the models at a\nhigher level. In this work, we present evidence that insights (both low-level\nfindings about specific heads and higher-level findings about general\nalgorithms) can indeed generalize across tasks. Specifically, we study the\ncircuit discovered in Wang et al. (2022) for the Indirect Object Identification\n(IOI) task and 1.) show that it reproduces on a larger GPT2 model, and 2.) that\nit is mostly reused to solve a seemingly different task: Colored Objects\n(Ippolito & Callison-Burch, 2023). We provide evidence that the process\nunderlying both tasks is functionally very similar, and contains about a 78%\noverlap in in-circuit attention heads. We further present a proof-of-concept\nintervention experiment, in which we adjust four attention heads in middle\nlayers in order to 'repair' the Colored Objects circuit and make it behave like\nthe IOI circuit. In doing so, we boost accuracy from 49.6% to 93.7% on the\nColored Objects task and explain most sources of error. 
The intervention\naffects downstream attention heads in specific ways predicted by their\ninteractions in the IOI circuit, indicating that this subcircuit behavior is\ninvariant to the different task inputs. Overall, our results provide evidence\nthat it may yet be possible to explain large language models' behavior in terms\nof a relatively small number of interpretable task-general algorithmic building\nblocks and computational components.\n","authors":["Jack Merullo","Carsten Eickhoff","Ellie Pavlick"],"pdf_url":"https://arxiv.org/pdf/2310.08744v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08743v1","updated":"2023-10-12T22:09:53Z","published":"2023-10-12T22:09:53Z","title":"Development and Validation of a Deep Learning-Based Microsatellite\n Instability Predictor from Prostate Cancer Whole-Slide Images","summary":" Microsatellite instability-high (MSI-H) is a tumor agnostic biomarker for\nimmune checkpoint inhibitor therapy. However, MSI status is not routinely\ntested in prostate cancer, in part due to low prevalence and assay cost. As\nsuch, prediction of MSI status from hematoxylin and eosin (H&E) stained\nwhole-slide images (WSIs) could identify prostate cancer patients most likely\nto benefit from confirmatory testing and becoming eligible for immunotherapy.\nProstate biopsies and surgical resections from de-identified records of\nconsecutive prostate cancer patients referred to our institution were analyzed.\nTheir MSI status was determined by next generation sequencing. Patients before\na cutoff date were split into an algorithm development set (n=4015, MSI-H 1.8%)\nand a paired validation set (n=173, MSI-H 19.7%) that consisted of two serial\nsections from each sample, one stained and scanned internally and the other at\nan external site. Patients after the cutoff date formed the temporal validation\nset (n=1350, MSI-H 2.3%). Attention-based multiple instance learning models\nwere trained to predict MSI-H from H&E WSIs. The MSI-H predictor achieved area\nunder the receiver operating characteristic curve values of 0.78 (95% CI\n[0.69-0.86]), 0.72 (95% CI [0.63-0.81]), and 0.72 (95% CI [0.62-0.82]) on the\ninternally prepared, externally prepared, and temporal validation sets,\nrespectively. While MSI-H status is significantly correlated with Gleason\nscore, the model remained predictive within each Gleason score subgroup. In\nsummary, we developed and validated an AI-based MSI-H diagnostic model on a\nlarge real-world cohort of routine H&E slides, which effectively generalized to\nexternally stained and scanned samples and a temporally independent validation\ncohort. This algorithm has the potential to direct prostate cancer patients\ntoward immunotherapy and to identify MSI-H cases secondary to Lynch syndrome.\n","authors":["Qiyuan Hu","Abbas A. Rizvi","Geoffery Schau","Kshitij Ingale","Yoni Muller","Rachel Baits","Sebastian Pretzer","Aïcha BenTaieb","Abigail Gordhamer","Roberto Nussenzveig","Adam Cole","Matthew O. Leavitt","Rohan P. Joshi","Nike Beaubier","Martin C. Stumpe","Kunal Nagpal"],"pdf_url":"https://arxiv.org/pdf/2310.08743v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.14600v2","updated":"2023-10-12T21:58:06Z","published":"2023-05-24T00:46:02Z","title":"Learning Semantic Role Labeling from Compatible Label Sequences","summary":" Semantic role labeling (SRL) has multiple disjoint label sets, e.g., VerbNet\nand PropBank. Creating these datasets is challenging, therefore a natural\nquestion is how to use each one to help the other. 
Prior work has shown that\ncross-task interaction helps, but only explored multitask learning so far. A\ncommon issue with multi-task setup is that argument sequences are still\nseparately decoded, running the risk of generating structurally inconsistent\nlabel sequences (as per lexicons like Semlink). In this paper, we eliminate\nsuch issue with a framework that jointly models VerbNet and PropBank labels as\none sequence. In this setup, we show that enforcing Semlink constraints during\ndecoding constantly improves the overall F1. With special input constructions,\nour joint model infers VerbNet arguments from given PropBank arguments with\nover 99 F1. For learning, we propose a constrained marginal model that learns\nwith knowledge defined in Semlink to further benefit from the large amounts of\nPropBank-only data. On the joint benchmark based on CoNLL05, our models achieve\nstate-of-the-art F1's, outperforming the prior best in-domain model by 3.5\n(VerbNet) and 0.8 (PropBank). For out-of-domain generalization, our models\nsurpass the prior best by 3.4 (VerbNet) and 0.2 (PropBank).\n","authors":["Tao Li","Ghazaleh Kazeminejad","Susan W. Brown","Martha Palmer","Vivek Srikumar"],"pdf_url":"https://arxiv.org/pdf/2305.14600v2.pdf","comment":"Accepted at Findings of EMNLP 2023"},{"id":"http://arxiv.org/abs/2310.08738v1","updated":"2023-10-12T21:51:25Z","published":"2023-10-12T21:51:25Z","title":"Splicing Up Your Predictions with RNA Contrastive Learning","summary":" In the face of rapidly accumulating genomic data, our understanding of the\nRNA regulatory code remains incomplete. Recent self-supervised methods in other\ndomains have demonstrated the ability to learn rules underlying the\ndata-generating process such as sentence structure in language. Inspired by\nthis, we extend contrastive learning techniques to genomic data by utilizing\nfunctional similarities between sequences generated through alternative\nsplicing and gene duplication. Our novel dataset and contrastive objective\nenable the learning of generalized RNA isoform representations. We validate\ntheir utility on downstream tasks such as RNA half-life and mean ribosome load\nprediction. Our pre-training strategy yields competitive results using linear\nprobing on both tasks, along with up to a two-fold increase in Pearson\ncorrelation in low-data conditions. Importantly, our exploration of the learned\nlatent space reveals that our contrastive objective yields semantically\nmeaningful representations, underscoring its potential as a valuable\ninitialization technique for RNA property prediction.\n","authors":["Philip Fradkin","Ruian Shi","Bo Wang","Brendan Frey","Leo J. Lee"],"pdf_url":"https://arxiv.org/pdf/2310.08738v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.00051v2","updated":"2023-10-12T21:47:53Z","published":"2022-12-30T20:38:54Z","title":"Learning from Guided Play: Improving Exploration for Adversarial\n Imitation Learning with Simple Auxiliary Tasks","summary":" Adversarial imitation learning (AIL) has become a popular alternative to\nsupervised imitation learning that reduces the distribution shift suffered by\nthe latter. However, AIL requires effective exploration during an online\nreinforcement learning phase. In this work, we show that the standard, naive\napproach to exploration can manifest as a suboptimal local maximum if a policy\nlearned with AIL sufficiently matches the expert distribution without fully\nlearning the desired task. 
This can be particularly catastrophic for\nmanipulation tasks, where the difference between an expert and a non-expert\nstate-action pair is often subtle. We present Learning from Guided Play (LfGP),\na framework in which we leverage expert demonstrations of multiple exploratory,\nauxiliary tasks in addition to a main task. The addition of these auxiliary\ntasks forces the agent to explore states and actions that standard AIL may\nlearn to ignore. Additionally, this particular formulation allows for the\nreusability of expert data between main tasks. Our experimental results in a\nchallenging multitask robotic manipulation domain indicate that LfGP\nsignificantly outperforms both AIL and behaviour cloning, while also being more\nexpert sample efficient than these baselines. To explain this performance gap,\nwe provide further analysis of a toy problem that highlights the coupling\nbetween a local maximum and poor exploration, and also visualize the\ndifferences between the learned models from AIL and LfGP.\n","authors":["Trevor Ablett","Bryan Chan","Jonathan Kelly"],"pdf_url":"https://arxiv.org/pdf/2301.00051v2.pdf","comment":"In IEEE Robotics and Automation Letters (RA-L) and presented at the\n IEEE/RSJ International Conference on Intelligent Robots and Systems\n (IROS'23), Detroit, MI, USA, Oct. 1-5, 2023. arXiv admin note: substantial\n text overlap with arXiv:2112.08932"},{"id":"http://arxiv.org/abs/2206.14697v3","updated":"2023-10-12T21:47:28Z","published":"2022-06-29T14:54:49Z","title":"Hidden Parameter Recurrent State Space Models For Changing Dynamics\n Scenarios","summary":" Recurrent State-space models (RSSMs) are highly expressive models for\nlearning patterns in time series data and system identification. However, these\nmodels assume that the dynamics are fixed and unchanging, which is rarely the\ncase in real-world scenarios. Many control applications often exhibit tasks\nwith similar but not identical dynamics which can be modeled as a latent\nvariable. We introduce the Hidden Parameter Recurrent State Space Models\n(HiP-RSSMs), a framework that parametrizes a family of related dynamical\nsystems with a low-dimensional set of latent factors. We present a simple and\neffective way of learning and performing inference over this Gaussian graphical\nmodel that avoids approximations like variational inference. We show that\nHiP-RSSMs outperforms RSSMs and competing multi-task models on several\nchallenging robotic benchmarks both on real-world systems and simulations.\n","authors":["Vaisakh Shaj","Dieter Buchler","Rohit Sonker","Philipp Becker","Gerhard Neumann"],"pdf_url":"https://arxiv.org/pdf/2206.14697v3.pdf","comment":"Published at the International Conference on Learning\n Representations, ICLR 2022"},{"id":"http://arxiv.org/abs/2305.16130v2","updated":"2023-10-12T21:43:18Z","published":"2023-05-25T15:04:01Z","title":"A Mechanism for Solving Relational Tasks in Transformer Language Models","summary":" A primary criticism towards language models (LMs) is their inscrutability.\nThis paper presents evidence that, despite their size and complexity, LMs\nsometimes exploit a simple computational mechanism to solve one-to-one\nrelational tasks (e.g., capital_of(Poland)=Warsaw). 
We investigate a range of\nlanguage model sizes (from 124M parameters to 176B parameters) in an in-context\nlearning setting, and find that for a variety of tasks (involving capital\ncities, upper-casing, and past-tensing) a key part of the mechanism reduces to\na simple linear update typically applied by the feedforward (FFN) networks.\nThese updates also tend to promote the output of the relation in a\ncontent-independent way (e.g., encoding Poland:Warsaw::China:Beijing),\nrevealing a predictable pattern that these models take in solving these tasks.\nWe further show that this mechanism is specific to tasks that require retrieval\nfrom pretraining memory, rather than retrieval from local context. Our results\ncontribute to a growing body of work on the mechanistic interpretability of\nLLMs, and offer reason to be optimistic that, despite the massive and\nnon-linear nature of the models, the strategies they ultimately use to solve\ntasks can sometimes reduce to familiar and even intuitive algorithms.\n","authors":["Jack Merullo","Carsten Eickhoff","Ellie Pavlick"],"pdf_url":"https://arxiv.org/pdf/2305.16130v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08732v1","updated":"2023-10-12T21:39:16Z","published":"2023-10-12T21:39:16Z","title":"Provably Robust Cost-Sensitive Learning via Randomized Smoothing","summary":" We focus on learning adversarially robust classifiers under a cost-sensitive\nscenario, where the potential harm of different classwise adversarial\ntransformations is encoded in a binary cost matrix. Existing methods are either\nempirical that cannot certify robustness or suffer from inherent scalability\nissues. In this work, we study whether randomized smoothing, a more scalable\nrobustness certification framework, can be leveraged to certify cost-sensitive\nrobustness. Built upon a notion of cost-sensitive certified radius, we show how\nto adapt the standard randomized smoothing certification pipeline to produce\ntight robustness guarantees for any cost matrix. In addition, with fine-grained\ncertified radius optimization schemes specifically designed for different data\nsubgroups, we propose an algorithm to train smoothed classifiers that are\noptimized for cost-sensitive robustness. Extensive experiments on image\nbenchmarks and a real-world medical dataset demonstrate the superiority of our\nmethod in achieving significantly improved performance of certified\ncost-sensitive robustness while having a negligible impact on overall accuracy.\n","authors":["Yuan Xin","Michael Backes","Xiao Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.08732v1.pdf","comment":"18 pages, 7 tables, 4 figures"},{"id":"http://arxiv.org/abs/2310.03320v2","updated":"2023-10-12T21:38:27Z","published":"2023-10-05T05:30:42Z","title":"BioBridge: Bridging Biomedical Foundation Models via Knowledge Graph","summary":" Foundation models (FMs) are able to leverage large volumes of unlabeled data\nto demonstrate superior performance across a wide range of tasks. However, FMs\ndeveloped for biomedical domains have largely remained unimodal, i.e.,\nindependently trained and used for tasks on protein sequences alone, small\nmolecule structures alone, or clinical data alone. To overcome this limitation\nof biomedical FMs, we present BioBridge, a novel parameter-efficient learning\nframework, to bridge independently trained unimodal FMs to establish multimodal\nbehavior. 
BioBridge achieves it by utilizing Knowledge Graphs (KG) to learn\ntransformations between one unimodal FM and another without fine-tuning any\nunderlying unimodal FMs. Our empirical results demonstrate that BioBridge can\nbeat the best baseline KG embedding methods (on average by around 76.3%) in\ncross-modal retrieval tasks. We also identify BioBridge demonstrates\nout-of-domain generalization ability by extrapolating to unseen modalities or\nrelations. Additionally, we also show that BioBridge presents itself as a\ngeneral purpose retriever that can aid biomedical multimodal question answering\nas well as enhance the guided generation of novel drugs.\n","authors":["Zifeng Wang","Zichen Wang","Balasubramaniam Srinivasan","Vassilis N. Ioannidis","Huzefa Rangwala","Rishita Anubhai"],"pdf_url":"https://arxiv.org/pdf/2310.03320v2.pdf","comment":"this paper needs further internal review for being published"},{"id":"http://arxiv.org/abs/2310.08731v1","updated":"2023-10-12T21:38:07Z","published":"2023-10-12T21:38:07Z","title":"A Simple Way to Incorporate Novelty Detection in World Models","summary":" Reinforcement learning (RL) using world models has found significant recent\nsuccesses. However, when a sudden change to world mechanics or properties\noccurs then agent performance and reliability can dramatically decline. We\nrefer to the sudden change in visual properties or state transitions as {\\em\nnovelties}. Implementing novelty detection within generated world model\nframeworks is a crucial task for protecting the agent when deployed. In this\npaper, we propose straightforward bounding approaches to incorporate novelty\ndetection into world model RL agents, by utilizing the misalignment of the\nworld model's hallucinated states and the true observed states as an anomaly\nscore. We first provide an ontology of novelty detection relevant to sequential\ndecision making, then we provide effective approaches to detecting novelties in\na distribution of transitions learned by an agent in a world model. Finally, we\nshow the advantage of our work in a novel environment compared to traditional\nmachine learning novelty detection methods as well as currently accepted RL\nfocused novelty detection algorithms.\n","authors":["Geigh Zollicoffer","Kenneth Eaton","Jonathan Balloch","Julia Kim","Mark O. Riedl","Robert Wright"],"pdf_url":"https://arxiv.org/pdf/2310.08731v1.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2112.09726v2","updated":"2023-10-12T17:57:51Z","published":"2021-12-17T19:22:01Z","title":"Soundify: Matching Sound Effects to Video","summary":" In the art of video editing, sound helps add character to an object and\nimmerse the viewer within a space. Through formative interviews with\nprofessional editors (N=10), we found that the task of adding sounds to video\ncan be challenging. This paper presents Soundify, a system that assists editors\nin matching sounds to video. Given a video, Soundify identifies matching\nsounds, synchronizes the sounds to the video, and dynamically adjusts panning\nand volume to create spatial audio. In a human evaluation study (N=889), we\nshow that Soundify is capable of matching sounds to video out-of-the-box for a\ndiverse range of audio categories. 
In a within-subjects expert study (N=12), we\ndemonstrate the usefulness of Soundify in helping video editors match sounds to\nvideo with lighter workload, reduced task completion time, and improved\nusability.\n","authors":["David Chuan-En Lin","Anastasis Germanidis","Cristóbal Valenzuela","Yining Shi","Nikolas Martelaro"],"pdf_url":"https://arxiv.org/pdf/2112.09726v2.pdf","comment":"Full paper in UIST 2023; Short paper in NeurIPS 2021 ML4CD Workshop;\n Online demo: https://soundify.cc"},{"id":"http://arxiv.org/abs/2310.08475v1","updated":"2023-10-12T16:32:44Z","published":"2023-10-12T16:32:44Z","title":"Can We Edit Multimodal Large Language Models?","summary":" In this paper, we focus on editing Multimodal Large Language Models (MLLMs).\nCompared to editing single-modal LLMs, multimodal model editing is more\nchallenging, which demands a higher level of scrutiny and careful consideration\nin the editing process. To facilitate research in this area, we construct a new\nbenchmark, dubbed MMEdit, for editing multimodal LLMs and establishing a suite\nof innovative metrics for evaluation. We conduct comprehensive experiments\ninvolving various model editing baselines and analyze the impact of editing\ndifferent components for multimodal LLMs. Empirically, we notice that previous\nbaselines can implement editing multimodal LLMs to some extent, but the effect\nis still barely satisfactory, indicating the potential difficulty of this task.\nWe hope that our work can provide the NLP community with insights\\footnote{Code\nand dataset are available in https://github.com/zjunlp/EasyEdit}.\n","authors":["Siyuan Cheng","Bozhong Tian","Qingbin Liu","Xi Chen","Yongheng Wang","Huajun Chen","Ningyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.08475v1.pdf","comment":"EMNLP 2023"},{"id":"http://arxiv.org/abs/2310.08205v1","updated":"2023-10-12T10:51:17Z","published":"2023-10-12T10:51:17Z","title":"LiveVV: Human-Centered Live Volumetric Video Streaming System","summary":" Volumetric video has emerged as a prominent medium within the realm of\neXtended Reality (XR) with the advancements in computer graphics and depth\ncapture hardware. Users can fully immerse themselves in volumetric video with\nthe ability to switch their viewport with six degrees of freedom (DOF), including\nthree rotational dimensions (yaw, pitch, roll) and three translational\ndimensions (X, Y, Z). Different from traditional 2D videos that are composed of\npixel matrices, volumetric videos employ point clouds, meshes, or voxels to\nrepresent a volumetric scene, resulting in significantly larger data sizes.\nWhile previous works have successfully achieved volumetric video streaming in\nvideo-on-demand scenarios, the live streaming of volumetric video remains an\nunresolved challenge due to the limited network bandwidth and stringent latency\nconstraints. In this paper, we propose the first holistic live\nvolumetric video streaming system, LiveVV, which achieves multi-view capture,\nscene segmentation \\& reuse, adaptive transmission, and rendering. LiveVV\ncontains multiple lightweight volumetric video capture modules that are capable\nof being deployed without prior preparation. 
To reduce bandwidth consumption,\nLiveVV processes static and dynamic volumetric content separately by reusing\nstatic data with low disparity and decimating data with low visual saliency.\nIn addition, to deal with network fluctuations, LiveVV integrates a volumetric video\nadaptive bitrate streaming algorithm (VABR) to enable fluent playback with the\nmaximum quality of experience. Extensive real-world experiments show that\nLiveVV can achieve live volumetric video streaming at a frame rate of 24 fps\nwith a latency of less than 350ms.\n","authors":["Kaiyuan Hu","Yongting Chen","Kaiying Han","Junhua Liu","Haowen Yang","Yili Jin","Boyan Li","Fangxin Wang"],"pdf_url":"https://arxiv.org/pdf/2310.08205v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08006v1","updated":"2023-10-12T03:19:13Z","published":"2023-10-12T03:19:13Z","title":"MCPNS: A Macropixel Collocated Position and Its Neighbors Search for\n Plenoptic 2.0 Video Coding","summary":" Recently, it was demonstrated that a newly focused plenoptic 2.0 camera can\ncapture much higher spatial resolution owing to its effective light field\nsampling, as compared to a traditional unfocused plenoptic 1.0 camera. However,\ndue to the difference in the nature of the optical structure between the plenoptic 1.0\nand 2.0 cameras, the existing fast motion estimation (ME) method for plenoptic\n1.0 videos is expected to be sub-optimal for encoding plenoptic 2.0 videos. In\nthis paper, we point out the main motion characteristic differences between\nplenoptic 1.0 and 2.0 videos and then propose a new fast ME, called macropixel\ncollocated position and its neighbors search (MCPNS), for plenoptic 2.0 videos.\nIn detail, we propose to reduce the number of macropixel collocated position\n(MCP) search candidates based on the new observation of center-biased motion\nvector distribution at macropixel resolution. After that, due to the large motion\ndeviation behavior around each MCP location in plenoptic 2.0 videos, we propose\nto select a certain number of key MCP locations with the lowest matching cost\nto perform the neighbors MCP search to improve the motion search accuracy.\nDifferent from existing methods, our method can achieve better performance\nwithout requiring prior knowledge of microlens array orientations. Our\nsimulation results confirmed the effectiveness of the proposed algorithm in\nterms of both bitrate savings and computational costs compared to existing\nmethods.\n","authors":["Vinh Van Duong","Thuc Nguyen Huu","Jonghoon Yim","Byeungwoo Jeon"],"pdf_url":"https://arxiv.org/pdf/2310.08006v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.06366v4","updated":"2023-10-12T22:25:43Z","published":"2022-10-12T16:18:25Z","title":"A Generalist Framework for Panoptic Segmentation of Images and Videos","summary":" Panoptic segmentation assigns semantic and instance ID labels to every pixel\nof an image. As permutations of instance IDs are also valid solutions, the task\nrequires learning of a high-dimensional one-to-many mapping. As a result,\nstate-of-the-art approaches use customized architectures and task-specific loss\nfunctions. We formulate panoptic segmentation as a discrete data generation\nproblem, without relying on the inductive bias of the task. A diffusion model is\nproposed to model panoptic masks, with a simple architecture and generic loss\nfunction. By simply adding past predictions as a conditioning signal, our\nmethod is capable of modeling video (in a streaming setting) and thereby learns\nto track object instances automatically. 
With extensive experiments, we\ndemonstrate that our simple approach can perform competitively to\nstate-of-the-art specialist methods in similar settings.\n","authors":["Ting Chen","Lala Li","Saurabh Saxena","Geoffrey Hinton","David J. Fleet"],"pdf_url":"https://arxiv.org/pdf/2210.06366v4.pdf","comment":"ICCV'23. Code at https://github.com/google-research/pix2seq"},{"id":"http://arxiv.org/abs/2309.08730v2","updated":"2023-10-12T21:28:02Z","published":"2023-09-15T19:31:40Z","title":"MusiLingo: Bridging Music and Text with Pre-trained Language Models for\n Music Captioning and Query Response","summary":" Large Language Models (LLMs) have shown immense potential in multimodal\napplications, yet the convergence of textual and musical domains remains\nrelatively unexplored. To address this gap, we present MusiLingo, a novel\nsystem for music caption generation and music-related query responses.\nMusiLingo employs a single projection layer to align music representations from\nthe pre-trained frozen music audio model MERT with the frozen Vicuna-7B\nlanguage model (an adaption of LLaMA), bridging the gap between music audio and\ntextual contexts. We train it on an extensive music caption dataset and\nfine-tune it with instructional data. Due to the scarcity of high-quality music\nQ\\&A datasets, we created the Music Instruct (MI) dataset from captions in the\nMusicCaps datasets, tailored for open-ended music inquiries. Empirical\nevaluations demonstrate its competitive performance in generating music\ncaptions and composing music-related Q&A pairs.\n","authors":["Zihao Deng","Yinghao Ma","Yudong Liu","Rongchen Guo","Ge Zhang","Wenhu Chen","Wenhao Huang","Emmanouil Benetos"],"pdf_url":"https://arxiv.org/pdf/2309.08730v2.pdf","comment":null}]},"2023-10-13T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2310.09266v1","updated":"2023-10-13T17:24:52Z","published":"2023-10-13T17:24:52Z","title":"User Inference Attacks on Large Language Models","summary":" Fine-tuning is a common and effective method for tailoring large language\nmodels (LLMs) to specialized tasks and applications. In this paper, we study\nthe privacy implications of fine-tuning LLMs on user data. To this end, we\ndefine a realistic threat model, called user inference, wherein an attacker\ninfers whether or not a user's data was used for fine-tuning. We implement\nattacks for this threat model that require only a small set of samples from a\nuser (possibly different from the samples used for training) and black-box\naccess to the fine-tuned LLM. We find that LLMs are susceptible to user\ninference attacks across a variety of fine-tuning datasets, at times with near\nperfect attack success rates. Further, we investigate which properties make\nusers vulnerable to user inference, finding that outlier users (i.e. those with\ndata distributions sufficiently different from other users) and users who\ncontribute large quantities of data are most susceptible to attack. Finally, we\nexplore several heuristics for mitigating privacy attacks. We find that\ninterventions in the training algorithm, such as batch or per-example gradient\nclipping and early stopping fail to prevent user inference. However, limiting\nthe number of fine-tuning samples from a single user can reduce attack\neffectiveness, albeit at the cost of reducing the total amount of fine-tuning\ndata.\n","authors":["Nikhil Kandpal","Krishna Pillutla","Alina Oprea","Peter Kairouz","Christopher A. 
Choquette-Choo","Zheng Xu"],"pdf_url":"https://arxiv.org/pdf/2310.09266v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09265v1","updated":"2023-10-13T17:23:17Z","published":"2023-10-13T17:23:17Z","title":"PromptRE: Weakly-Supervised Document-Level Relation Extraction via\n Prompting-Based Data Programming","summary":" Relation extraction aims to classify the relationships between two entities\ninto pre-defined categories. While previous research has mainly focused on\nsentence-level relation extraction, recent studies have expanded the scope to\ndocument-level relation extraction. Traditional relation extraction methods\nheavily rely on human-annotated training data, which is time-consuming and\nlabor-intensive. To mitigate the need for manual annotation, recent\nweakly-supervised approaches have been developed for sentence-level relation\nextraction, while limited work has been done on document-level relation\nextraction. Weakly-supervised document-level relation extraction faces\nsignificant challenges due to an imbalanced number of \"no relation\" instances and\nthe failure of directly probing pretrained large language models for document\nrelation extraction. To address these challenges, we propose PromptRE, a novel\nweakly-supervised document-level relation extraction method that combines\nprompting-based techniques with data programming. Furthermore, PromptRE\nincorporates the label distribution and entity types as prior knowledge to\nimprove the performance. By leveraging the strengths of both prompting and data\nprogramming, PromptRE achieves improved performance in relation classification\nand effectively handles the \"no relation\" problem. Experimental results on\nReDocRED, a benchmark dataset for document-level relation extraction,\ndemonstrate the superiority of PromptRE over baseline approaches.\n","authors":["Chufan Gao","Xulin Fan","Jimeng Sun","Xuan Wang"],"pdf_url":"https://arxiv.org/pdf/2310.09265v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.00807v3","updated":"2023-10-13T17:23:04Z","published":"2023-03-01T20:21:23Z","title":"UDAPDR: Unsupervised Domain Adaptation via LLM Prompting and\n Distillation of Rerankers","summary":" Many information retrieval tasks require large labeled datasets for\nfine-tuning. However, such datasets are often unavailable, and their utility\nfor real-world applications can diminish quickly due to domain shifts. To\naddress this challenge, we develop and motivate a method for using large\nlanguage models (LLMs) to generate large numbers of synthetic queries cheaply.\nThe method begins by generating a small number of synthetic queries using an\nexpensive LLM. After that, a much less expensive one is used to create large\nnumbers of synthetic queries, which are used to fine-tune a family of reranker\nmodels. These rerankers are then distilled into a single efficient retriever\nfor use in the target domain. 
We show that this technique boosts zero-shot\naccuracy in long-tail domains and achieves substantially lower latency than\nstandard reranking methods.\n","authors":["Jon Saad-Falcon","Omar Khattab","Keshav Santhanam","Radu Florian","Martin Franz","Salim Roukos","Avirup Sil","Md Arafat Sultan","Christopher Potts"],"pdf_url":"https://arxiv.org/pdf/2303.00807v3.pdf","comment":"Long Paper at Empirical Methods in Natural Language Processing\n (EMNLP) 2023"},{"id":"http://arxiv.org/abs/2310.09263v1","updated":"2023-10-13T17:20:56Z","published":"2023-10-13T17:20:56Z","title":"Table-GPT: Table-tuned GPT for Diverse Table Tasks","summary":" Language models, such as GPT-3.5 and ChatGPT, demonstrate remarkable\nabilities to follow diverse human instructions and perform a wide range of\ntasks. However, when probing language models using a range of basic\ntable-understanding tasks, we observe that today's language models are still\nsub-optimal in many table-related tasks, likely because they are pre-trained\npredominantly on \\emph{one-dimensional} natural-language texts, whereas\nrelational tables are \\emph{two-dimensional} objects.\n In this work, we propose a new \"\\emph{table-tuning}\" paradigm, where we\ncontinue to train/fine-tune language models like GPT-3.5 and ChatGPT, using\ndiverse table-tasks synthesized from real tables as training data, with the\ngoal of enhancing language models' ability to understand tables and perform\ntable tasks. We show that our resulting Table-GPT models demonstrate (1) better\n\\emph{table-understanding} capabilities, by consistently outperforming the\nvanilla GPT-3.5 and ChatGPT, on a wide-range of table tasks, including holdout\nunseen tasks, and (2) strong \\emph{generalizability}, in its ability to respond\nto diverse human instructions to perform new table-tasks, in a manner similar\nto GPT-3.5 and ChatGPT.\n","authors":["Peng Li","Yeye He","Dror Yashar","Weiwei Cui","Song Ge","Haidong Zhang","Danielle Rifinski Fainman","Dongmei Zhang","Surajit Chaudhuri"],"pdf_url":"https://arxiv.org/pdf/2310.09263v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09256v1","updated":"2023-10-13T17:13:00Z","published":"2023-10-13T17:13:00Z","title":"Political claim identification and categorization in a multilingual\n setting: First experiments","summary":" The identification and classification of political claims is an important\nstep in the analysis of political newspaper reports; however, resources for\nthis task are few and far between. This paper explores different strategies for\nthe cross-lingual projection of political claims analysis. We conduct\nexperiments on a German dataset, DebateNet2.0, covering the policy debate\nsparked by the 2015 refugee crisis. Our evaluation involves two tasks (claim\nidentification and categorization), three languages (German, English, and\nFrench) and two methods (machine translation -- the best method in our\nexperiments -- and multilingual embeddings).\n","authors":["Urs Zaberer","Sebastian Padó","Gabriella Lapesa"],"pdf_url":"https://arxiv.org/pdf/2310.09256v1.pdf","comment":"Presented at KONVENS 2023, Ingolstadt, Germany"},{"id":"http://arxiv.org/abs/2307.02599v2","updated":"2023-10-13T17:01:11Z","published":"2023-07-05T18:48:28Z","title":"Evade ChatGPT Detectors via A Single Space","summary":" ChatGPT brings revolutionary social value but also raises concerns about the\nmisuse of AI-generated text. Consequently, an important question is how to\ndetect whether texts are generated by ChatGPT or by human. 
Existing detectors\nare built upon the assumption that there are distributional gaps between\nhuman-generated and AI-generated text. These gaps are typically identified\nusing statistical information or classifiers. Our research challenges the\ndistributional gap assumption in detectors. We find that detectors do not\neffectively discriminate the semantic and stylistic gaps between\nhuman-generated and AI-generated text. Instead, the \"subtle differences\", such\nas an extra space, become crucial for detection. Based on this discovery, we\npropose the SpaceInfi strategy to evade detection. Experiments demonstrate the\neffectiveness of this strategy across multiple benchmarks and detectors. We\nalso provide a theoretical explanation for why SpaceInfi is successful in\nevading perplexity-based detection. And we empirically show that a phenomenon\ncalled token mutation causes the evasion for language model-based detectors.\nOur findings offer new insights and challenges for understanding and\nconstructing more applicable ChatGPT detectors.\n","authors":["Shuyang Cai","Wanyun Cui"],"pdf_url":"https://arxiv.org/pdf/2307.02599v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09247v1","updated":"2023-10-13T16:53:25Z","published":"2023-10-13T16:53:25Z","title":"Hypernymy Understanding Evaluation of Text-to-Image Models via WordNet\n Hierarchy","summary":" Text-to-image synthesis has recently attracted widespread attention due to\nrapidly improving quality and numerous practical applications. However, the\nlanguage understanding capabilities of text-to-image models are still poorly\nunderstood, which makes it difficult to reason about prompt formulations that a\ngiven model would understand well. In this work, we measure the capability of\npopular text-to-image models to understand $\\textit{hypernymy}$, or the \"is-a\"\nrelation between words. We design two automatic metrics based on the WordNet\nsemantic hierarchy and existing image classifiers pretrained on ImageNet. These\nmetrics both enable broad quantitative comparison of linguistic capabilities\nfor text-to-image models and offer a way of finding fine-grained qualitative\ndifferences, such as words that are unknown to models and thus are difficult\nfor them to draw. We comprehensively evaluate popular text-to-image models,\nincluding GLIDE, Latent Diffusion, and Stable Diffusion, showing how our\nmetrics can provide a better understanding of the individual strengths and\nweaknesses of these models.\n","authors":["Anton Baryshnikov","Max Ryabinin"],"pdf_url":"https://arxiv.org/pdf/2310.09247v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09241v1","updated":"2023-10-13T16:47:20Z","published":"2023-10-13T16:47:20Z","title":"Precedent-Enhanced Legal Judgment Prediction with LLM and Domain-Model\n Collaboration","summary":" Legal Judgment Prediction (LJP) has become an increasingly crucial task in\nLegal AI, i.e., predicting the judgment of the case in terms of case fact\ndescription. Precedents are the previous legal cases with similar facts, which\nare the basis for the judgment of the subsequent case in national legal\nsystems. Thus, it is worthwhile to explore the utilization of precedents in the\nLJP. Recent advances in deep learning have enabled a variety of techniques to\nbe used to solve the LJP task. These can be broken down into two categories:\nlarge language models (LLMs) and domain-specific models. 
LLMs are capable of\ninterpreting and generating complex natural language, while domain models are\nefficient in learning task-specific information. In this paper, we propose the\nprecedent-enhanced LJP framework (PLJP), a system that leverages the strength\nof both LLM and domain models in the context of precedents. Specifically, the\ndomain models are designed to provide candidate labels and find the proper\nprecedents efficiently, and the large models will make the final prediction\nwith an in-context precedents comprehension. Experiments on the real-world\ndataset demonstrate the effectiveness of our PLJP. Moreover, our work shows a\npromising direction for LLM and domain-model collaboration that can be\ngeneralized to other vertical domains.\n","authors":["Yiquan Wu","Siying Zhou","Yifei Liu","Weiming Lu","Xiaozhong Liu","Yating Zhang","Changlong Sun","Fei Wu","Kun Kuang"],"pdf_url":"https://arxiv.org/pdf/2310.09241v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09238v1","updated":"2023-10-13T16:46:38Z","published":"2023-10-13T16:46:38Z","title":"BanglaNLP at BLP-2023 Task 2: Benchmarking different Transformer Models\n for Sentiment Analysis of Bangla Social Media Posts","summary":" Bangla is the 7th most widely spoken language globally, with a staggering 234\nmillion native speakers primarily hailing from India and Bangladesh. This\nmorphologically rich language boasts a rich literary tradition, encompassing\ndiverse dialects and language-specific challenges. Despite its linguistic\nrichness and history, Bangla remains categorized as a low-resource language\nwithin the natural language processing (NLP) and speech community. This paper\npresents our submission to Task 2 (Sentiment Analysis of Bangla Social Media\nPosts) of the BLP Workshop. We experiment with various Transformer-based\narchitectures to solve this task. Our quantitative results show that transfer\nlearning really helps in better learning of the models in this low-resource\nlanguage scenario. This becomes evident when we further finetune a model which\nhas already been finetuned on twitter data for sentiment analysis task and that\nfinetuned model performs the best among all other models. We also perform a\ndetailed error analysis where we find some instances where ground truth labels\nneed to be relooked at. We obtain a micro-F1 of 67.02\\% on the test set and our\nperformance in this shared task is ranked at 21 in the leaderboard.\n","authors":["Saumajit Saha","Albert Nanda"],"pdf_url":"https://arxiv.org/pdf/2310.09238v1.pdf","comment":"7 pages, 2 figures, workshop"},{"id":"http://arxiv.org/abs/2310.09233v1","updated":"2023-10-13T16:37:14Z","published":"2023-10-13T16:37:14Z","title":"AgentCF: Collaborative Learning with Autonomous Language Agents for\n Recommender Systems","summary":" Recently, there has been an emergence of employing LLM-powered agents as\nbelievable human proxies, based on their remarkable decision-making capability.\nHowever, existing studies mainly focus on simulating human dialogue. Human\nnon-verbal behaviors, such as item clicking in recommender systems, although\nimplicitly exhibiting user preferences and could enhance the modeling of users,\nhave not been deeply explored. The main reasons lie in the gap between language\nmodeling and behavior modeling, as well as the incomprehension of LLMs about\nuser-item relations.\n To address this issue, we propose AgentCF for simulating user-item\ninteractions in recommender systems through agent-based collaborative\nfiltering. 
We creatively consider not only users but also items as agents, and\ndevelop a collaborative learning approach that optimizes both kinds of agents\ntogether. Specifically, at each time step, we first prompt the user and item\nagents to interact autonomously. Then, based on the disparities between the\nagents' decisions and real-world interaction records, user and item agents are\nprompted to reflect on and adjust the misleading simulations collaboratively,\nthereby modeling their two-sided relations. The optimized agents can also\npropagate their preferences to other agents in subsequent interactions,\nimplicitly capturing the collaborative filtering idea. Overall, the optimized\nagents exhibit diverse interaction behaviors within our framework, including\nuser-item, user-user, item-item, and collective interactions. The results show\nthat these agents can demonstrate personalized behaviors akin to those of\nreal-world individuals, sparking the development of next-generation user\nbehavior simulation.\n","authors":["Junjie Zhang","Yupeng Hou","Ruobing Xie","Wenqi Sun","Julian McAuley","Wayne Xin Zhao","Leyu Lin","Ji-Rong Wen"],"pdf_url":"https://arxiv.org/pdf/2310.09233v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09223v1","updated":"2023-10-13T16:21:07Z","published":"2023-10-13T16:21:07Z","title":"Automated Claim Matching with Large Language Models: Empowering\n Fact-Checkers in the Fight Against Misinformation","summary":" In today's digital era, the rapid spread of misinformation poses threats to\npublic well-being and societal trust. As online misinformation proliferates,\nmanual verification by fact checkers becomes increasingly challenging. We\nintroduce FACT-GPT (Fact-checking Augmentation with Claim matching\nTask-oriented Generative Pre-trained Transformer), a framework designed to\nautomate the claim matching phase of fact-checking using Large Language Models\n(LLMs). This framework identifies new social media content that either supports\nor contradicts claims previously debunked by fact-checkers. Our approach\nemploys GPT-4 to generate a labeled dataset consisting of simulated social\nmedia posts. This data set serves as a training ground for fine-tuning more\nspecialized LLMs. We evaluated FACT-GPT on an extensive dataset of social media\ncontent related to public health. The results indicate that our fine-tuned LLMs\nrival the performance of larger pre-trained LLMs in claim matching tasks,\naligning closely with human annotations. This study achieves three key\nmilestones: it provides an automated framework for enhanced fact-checking;\ndemonstrates the potential of LLMs to complement human expertise; offers public\nresources, including datasets and models, to further research and applications\nin the fact-checking domain.\n","authors":["Eun Cheol Choi","Emilio Ferrara"],"pdf_url":"https://arxiv.org/pdf/2310.09223v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.03026v2","updated":"2023-10-13T16:13:43Z","published":"2023-10-04T17:59:49Z","title":"LanguageMPC: Large Language Models as Decision Makers for Autonomous\n Driving","summary":" Existing learning-based autonomous driving (AD) systems face challenges in\ncomprehending high-level information, generalizing to rare events, and\nproviding interpretability. To address these problems, this work employs Large\nLanguage Models (LLMs) as a decision-making component for complex AD scenarios\nthat require human commonsense understanding. 
We devise cognitive pathways to\nenable comprehensive reasoning with LLMs, and develop algorithms for\ntranslating LLM decisions into actionable driving commands. Through this\napproach, LLM decisions are seamlessly integrated with low-level controllers by\nguided parameter matrix adaptation. Extensive experiments demonstrate that our\nproposed method not only consistently surpasses baseline approaches in\nsingle-vehicle tasks, but also helps handle complex driving behaviors, even\nmulti-vehicle coordination, thanks to the commonsense reasoning capabilities of\nLLMs. This paper presents an initial step toward leveraging LLMs as effective\ndecision-makers for intricate AD scenarios in terms of safety, efficiency,\ngeneralizability, and interoperability. We aspire for it to serve as\ninspiration for future research in this field. Project page:\nhttps://sites.google.com/view/llm-mpc\n","authors":["Hao Sha","Yao Mu","Yuxuan Jiang","Li Chen","Chenfeng Xu","Ping Luo","Shengbo Eben Li","Masayoshi Tomizuka","Wei Zhan","Mingyu Ding"],"pdf_url":"https://arxiv.org/pdf/2310.03026v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09219v1","updated":"2023-10-13T16:12:57Z","published":"2023-10-13T16:12:57Z","title":"\"Kelly is a Warm Person, Joseph is a Role Model\": Gender Biases in\n LLM-Generated Reference Letters","summary":" As generative language models advance, users have started to utilize Large\nLanguage Models (LLMs) to assist in writing various types of content, including\nprofessional documents such as recommendation letters. Despite their\nconvenience, these applications introduce unprecedented fairness concerns. As\ngenerated reference letters might be directly utilized by users in professional\nor academic scenarios, they have the potential to cause direct social harms,\nsuch as lowering success rates for female applicants. Therefore, it is urgent\nand necessary to comprehensively study fairness issues and associated harms in\nsuch real-world use cases for future mitigation and monitoring. In this paper,\nwe critically examine gender bias in LLM-generated reference letters. Inspired\nby findings in social science, we design evaluation methods to manifest gender\nbiases in LLM-generated letters through 2 dimensions: biases in language style\nand biases in lexical content. Furthermore, we investigate the extent of bias\npropagation by separately analyzing bias amplification in model-hallucinated\ncontents, which we define to be the hallucination bias of model-generated\ndocuments. Through benchmarking evaluation on 4 popular LLMs, including\nChatGPT, Alpaca, Vicuna and StableLM, our study reveals significant gender\nbiases in LLM-generated recommendation letters. Our findings further point\ntowards the importance and urgency of recognizing biases in LLM-generated\nprofessional documents.\n","authors":["Yixin Wan","George Pu","Jiao Sun","Aparna Garimella","Kai-Wei Chang","Nanyun Peng"],"pdf_url":"https://arxiv.org/pdf/2310.09219v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.01248v3","updated":"2023-10-13T15:53:00Z","published":"2023-03-01T06:16:14Z","title":"Can ChatGPT Assess Human Personalities? A General Evaluation Framework","summary":" Large Language Models (LLMs), especially ChatGPT, have produced impressive\nresults in various areas, but their potential human-like psychology is still\nlargely unexplored. Existing works study the virtual personalities of LLMs but\nrarely explore the possibility of analyzing human personalities via LLMs. 
This\npaper presents a generic evaluation framework for LLMs to assess human\npersonalities based on Myers Briggs Type Indicator (MBTI) tests. Specifically,\nwe first devise unbiased prompts by randomly permuting options in MBTI\nquestions and adopt the average testing result to encourage more impartial\nanswer generation. Then, we propose to replace the subject in question\nstatements to enable flexible queries and assessments on different subjects\nfrom LLMs. Finally, we re-formulate the question instructions in a manner of\ncorrectness evaluation to facilitate LLMs to generate clearer responses. The\nproposed framework enables LLMs to flexibly assess personalities of different\ngroups of people. We further propose three evaluation metrics to measure the\nconsistency, robustness, and fairness of assessment results from\nstate-of-the-art LLMs including ChatGPT and GPT-4. Our experiments reveal\nChatGPT's ability to assess human personalities, and the average results\ndemonstrate that it can achieve more consistent and fairer assessments in spite\nof lower robustness against prompt biases compared with InstructGPT.\n","authors":["Haocong Rao","Cyril Leung","Chunyan Miao"],"pdf_url":"https://arxiv.org/pdf/2303.01248v3.pdf","comment":"Accepted to EMNLP 2023. Our codes are available at\n https://github.com/Kali-Hac/ChatGPT-MBTI"},{"id":"http://arxiv.org/abs/2308.01497v2","updated":"2023-10-13T15:51:46Z","published":"2023-08-03T01:46:27Z","title":"Large Language Model Displays Emergent Ability to Interpret Novel\n Literary Metaphors","summary":" Recent advances in the performance of large language models (LLMs) have\nsparked debate over whether, given sufficient training, high-level human\nabilities emerge in such generic forms of artificial intelligence (AI). Despite\nthe exceptional performance of LLMs on a wide range of tasks involving natural\nlanguage processing and reasoning, there has been sharp disagreement as to\nwhether their abilities extend to more creative human abilities. A core example\nis the ability to interpret novel metaphors. Given the enormous and non curated\ntext corpora used to train LLMs, a serious obstacle to designing tests is the\nrequirement of finding novel yet high quality metaphors that are unlikely to\nhave been included in the training data. Here we assessed the ability of GPT4,\na state of the art large language model, to provide natural-language\ninterpretations of novel literary metaphors drawn from Serbian poetry and\ntranslated into English. Despite exhibiting no signs of having been exposed to\nthese metaphors previously, the AI system consistently produced detailed and\nincisive interpretations. Human judges, blind to the fact that an AI model was\ninvolved, rated metaphor interpretations generated by GPT4 as superior to those\nprovided by a group of college students. In interpreting reversed metaphors,\nGPT4, as well as humans, exhibited signs of sensitivity to the Gricean\ncooperative principle. In addition, for several novel English poems GPT4\nproduced interpretations that were rated as excellent or good by a human\nliterary critic. These results indicate that LLMs such as GPT4 have acquired an\nemergent ability to interpret complex metaphors, including those embedded in\nnovel poems.\n","authors":["Nicholas Ichien","Dušan Stamenković","Keith J. 
Holyoak"],"pdf_url":"https://arxiv.org/pdf/2308.01497v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.10977v2","updated":"2023-10-13T15:38:46Z","published":"2022-05-23T00:57:33Z","title":"What should I Ask: A Knowledge-driven Approach for Follow-up Questions\n Generation in Conversational Surveys","summary":" Generating follow-up questions on the fly could significantly improve\nconversational survey quality and user experiences by enabling a more dynamic\nand personalized survey structure. In this paper, we proposed a novel task for\nknowledge-driven follow-up question generation in conversational surveys. We\nconstructed a new human-annotated dataset of human-written follow-up questions\nwith dialogue history and labeled knowledge in the context of conversational\nsurveys. Along with the dataset, we designed and validated a set of\nreference-free Gricean-inspired evaluation metrics to systematically evaluate\nthe quality of generated follow-up questions. We then propose a two-staged\nknowledge-driven model for the task, which generates informative and coherent\nfollow-up questions by using knowledge to steer the generation process. The\nexperiments demonstrate that compared to GPT-based baseline models, our\ntwo-staged model generates more informative, coherent, and clear follow-up\nquestions.\n","authors":["Yubin Ge","Ziang Xiao","Jana Diesner","Heng Ji","Karrie Karahalios","Hari Sundaram"],"pdf_url":"https://arxiv.org/pdf/2205.10977v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.17216v3","updated":"2023-10-13T15:35:42Z","published":"2023-05-26T19:22:03Z","title":"Generating Images with Multimodal Language Models","summary":" We propose a method to fuse frozen text-only large language models (LLMs)\nwith pre-trained image encoder and decoder models, by mapping between their\nembedding spaces. Our model demonstrates a wide suite of multimodal\ncapabilities: image retrieval, novel image generation, and multimodal dialogue.\nOurs is the first approach capable of conditioning on arbitrarily interleaved\nimage and text inputs to generate coherent image (and text) outputs. To achieve\nstrong performance on image generation, we propose an efficient mapping network\nto ground the LLM to an off-the-shelf text-to-image generation model. This\nmapping network translates hidden representations of text into the embedding\nspace of the visual models, enabling us to leverage the strong text\nrepresentations of the LLM for visual outputs. Our approach outperforms\nbaseline generation models on tasks with longer and more complex language. In\naddition to novel image generation, our model is also capable of image\nretrieval from a prespecified dataset, and decides whether to retrieve or\ngenerate at inference time. This is done with a learnt decision module which\nconditions on the hidden representations of the LLM. Our model exhibits a wider\nrange of capabilities compared to prior multimodal language models. It can\nprocess image-and-text inputs, and produce retrieved images, generated images,\nand generated text -- outperforming non-LLM based generation models across\nseveral text-to-image tasks that measure context dependence.\n","authors":["Jing Yu Koh","Daniel Fried","Ruslan Salakhutdinov"],"pdf_url":"https://arxiv.org/pdf/2305.17216v3.pdf","comment":"NeurIPS 2023. 
Project page: http://jykoh.com/gill"},{"id":"http://arxiv.org/abs/2308.02490v2","updated":"2023-10-13T15:16:59Z","published":"2023-08-04T17:59:47Z","title":"MM-Vet: Evaluating Large Multimodal Models for Integrated Capabilities","summary":" We propose MM-Vet, an evaluation benchmark that examines large multimodal\nmodels (LMMs) on complicated multimodal tasks. Recent LMMs have shown various\nintriguing abilities, such as solving math problems written on the blackboard,\nreasoning about events and celebrities in news images, and explaining visual\njokes. Rapid model advancements pose challenges to evaluation benchmark\ndevelopment. Problems include: (1) How to systematically structure and evaluate\nthe complicated multimodal tasks; (2) How to design evaluation metrics that\nwork well across question and answer types; and (3) How to give model insights\nbeyond a simple performance ranking. To this end, we present MM-Vet, designed\nbased on the insight that the intriguing ability to solve complicated tasks is\noften achieved by a generalist model being able to integrate different core\nvision-language (VL) capabilities. MM-Vet defines 6 core VL capabilities and\nexamines the 16 integrations of interest derived from the capability\ncombination. For evaluation metrics, we propose an LLM-based evaluator for\nopen-ended outputs. The evaluator enables the evaluation across different\nquestion types and answer styles, resulting in a unified scoring metric. We\nevaluate representative LMMs on MM-Vet, providing insights into the\ncapabilities of different LMM system paradigms and models. Code and data are\navailable at https://github.com/yuweihao/MM-Vet.\n","authors":["Weihao Yu","Zhengyuan Yang","Linjie Li","Jianfeng Wang","Kevin Lin","Zicheng Liu","Xinchao Wang","Lijuan Wang"],"pdf_url":"https://arxiv.org/pdf/2308.02490v2.pdf","comment":"Update results of OpenFlamingo-9B (MPT), LLaMA-Adapter v2-7B, and\n Otter-9B (MPT). Code, data and leaderboard:\n https://github.com/yuweihao/MM-Vet"},{"id":"http://arxiv.org/abs/2310.05597v2","updated":"2023-10-13T15:07:28Z","published":"2023-10-09T10:34:38Z","title":"Can language models learn analogical reasoning? Investigating training\n objectives and comparisons to human performance","summary":" While analogies are a common way to evaluate word embeddings in NLP, it is\nalso of interest to investigate whether or not analogical reasoning is a task\nin itself that can be learned. In this paper, we test several ways to learn\nbasic analogical reasoning, specifically focusing on analogies that are more\ntypical of what is used to evaluate analogical reasoning in humans than those\nin commonly used NLP benchmarks. Our experiments find that models are able to\nlearn analogical reasoning, even with a small amount of data. We additionally\ncompare our models to a dataset with a human baseline, and find that after\ntraining, models approach human performance.\n","authors":["Molly R. Petersen","Lonneke van der Plas"],"pdf_url":"https://arxiv.org/pdf/2310.05597v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09168v1","updated":"2023-10-13T15:03:15Z","published":"2023-10-13T15:03:15Z","title":"Explore-Instruct: Enhancing Domain-Specific Instruction Coverage through\n Active Exploration","summary":" Instruction-tuning can be substantially optimized through enhanced diversity,\nresulting in models capable of handling a broader spectrum of tasks. 
However,\nexisting data employed for such tuning often exhibit an inadequate coverage of\nindividual domains, limiting the scope for nuanced comprehension and\ninteractions within these areas. To address this deficiency, we propose\nExplore-Instruct, a novel approach to enhance the data coverage to be used in\ndomain-specific instruction-tuning through active exploration via Large\nLanguage Models (LLMs). Built upon representative domain use cases,\nExplore-Instruct explores a multitude of variations or possibilities by\nimplementing a search algorithm to obtain diversified and domain-focused\ninstruction-tuning data. Our data-centric analysis validates the effectiveness\nof this proposed approach in improving domain-specific instruction coverage.\nMoreover, our model's performance demonstrates considerable advancements over\nmultiple baselines, including those utilizing domain-specific data enhancement.\nOur findings offer a promising opportunity to improve instruction coverage,\nespecially in domain-specific contexts, thereby advancing the development of\nadaptable language models. Our code, model weights, and data are public at\n\\url{https://github.com/fanqiwan/Explore-Instruct}.\n","authors":["Fanqi Wan","Xinting Huang","Tao Yang","Xiaojun Quan","Wei Bi","Shuming Shi"],"pdf_url":"https://arxiv.org/pdf/2310.09168v1.pdf","comment":"Accepted to EMNLP 2023 (Main Conference)"},{"id":"http://arxiv.org/abs/2310.09166v1","updated":"2023-10-13T15:01:17Z","published":"2023-10-13T15:01:17Z","title":"Developing a Natural Language Understanding Model to Characterize Cable\n News Bias","summary":" Media bias has been extensively studied by both social and computational\nsciences. However, current work still has a large reliance on human input and\nsubjective assessment to label biases. This is especially true for cable news\nresearch. To address these issues, we develop an unsupervised machine learning\nmethod to characterize the bias of cable news programs without any human input.\nThis method relies on the analysis of what topics are mentioned through Named\nEntity Recognition and how those topics are discussed through Stance Analysis\nin order to cluster programs with similar biases together. Applying our method\nto 2020 cable news transcripts, we find that program clusters are consistent\nover time and roughly correspond to the cable news network of the program. This\nmethod reveals the potential for future tools to objectively assess media bias\nand characterize unfamiliar media environments.\n","authors":["Seth P. Benson","Iain J. Cruickshank"],"pdf_url":"https://arxiv.org/pdf/2310.09166v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09151v1","updated":"2023-10-13T14:44:34Z","published":"2023-10-13T14:44:34Z","title":"BibRank: Automatic Keyphrase Extraction Platform Using~Metadata","summary":" Automatic Keyphrase Extraction involves identifying essential phrases in a\ndocument. These keyphrases are crucial in various tasks such as document\nclassification, clustering, recommendation, indexing, searching, summarization,\nand text simplification. This paper introduces a platform that integrates\nkeyphrase datasets and facilitates the evaluation of keyphrase extraction\nalgorithms. The platform includes BibRank, an automatic keyphrase extraction\nalgorithm that leverages a rich dataset obtained by parsing bibliographic data\nin BibTeX format. 
BibRank combines innovative weighting techniques with\npositional, statistical, and word co-occurrence information to extract\nkeyphrases from documents. The platform proves valuable for researchers and\ndevelopers seeking to enhance their keyphrase extraction algorithms and advance\nthe field of natural language processing.\n","authors":["Abdelrhman Eldallal","Eduard Barbu"],"pdf_url":"https://arxiv.org/pdf/2310.09151v1.pdf","comment":"12 pages , 4 figures, 8 tables"},{"id":"http://arxiv.org/abs/2310.09141v1","updated":"2023-10-13T14:33:02Z","published":"2023-10-13T14:33:02Z","title":"PuoBERTa: Training and evaluation of a curated language model for\n Setswana","summary":" Natural language processing (NLP) has made significant progress for\nwell-resourced languages such as English but lagged behind for low-resource\nlanguages like Setswana. This paper addresses this gap by presenting PuoBERTa,\na customised masked language model trained specifically for Setswana. We cover\nhow we collected, curated, and prepared diverse monolingual texts to generate a\nhigh-quality corpus for PuoBERTa's training. Building upon previous efforts in\ncreating monolingual resources for Setswana, we evaluated PuoBERTa across\nseveral NLP tasks, including part-of-speech (POS) tagging, named entity\nrecognition (NER), and news categorisation. Additionally, we introduced a new\nSetswana news categorisation dataset and provided the initial benchmarks using\nPuoBERTa. Our work demonstrates the efficacy of PuoBERTa in fostering NLP\ncapabilities for understudied languages like Setswana and paves the way for\nfuture research directions.\n","authors":["Vukosi Marivate","Moseli Mots'Oehli","Valencia Wagner","Richard Lastrucci","Isheanesu Dzingirai"],"pdf_url":"https://arxiv.org/pdf/2310.09141v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.00639v2","updated":"2023-10-13T14:28:03Z","published":"2023-06-01T13:06:43Z","title":"Being Right for Whose Right Reasons?","summary":" Explainability methods are used to benchmark the extent to which model\npredictions align with human rationales i.e., are 'right for the right\nreasons'. Previous work has failed to acknowledge, however, that what counts as\na rationale is sometimes subjective. This paper presents what we think is a\nfirst of its kind, a collection of human rationale annotations augmented with\nthe annotators demographic information. We cover three datasets spanning\nsentiment analysis and common-sense reasoning, and six demographic groups\n(balanced across age and ethnicity). Such data enables us to ask both what\ndemographics our predictions align with and whose reasoning patterns our\nmodels' rationales align with. We find systematic inter-group annotator\ndisagreement and show how 16 Transformer-based models align better with\nrationales provided by certain demographic groups: We find that models are\nbiased towards aligning best with older and/or white annotators. 
We zoom in on\nthe effects of model size and model distillation, finding -- contrary to our\nexpectations -- negative correlations between model size and rationale\nagreement as well as no evidence that either model size or model distillation\nimproves fairness.\n","authors":["Terne Sasha Thorn Jakobsen","Laura Cabello","Anders Søgaard"],"pdf_url":"https://arxiv.org/pdf/2306.00639v2.pdf","comment":"In Proceedings of ACL 2023"},{"id":"http://arxiv.org/abs/2310.09139v1","updated":"2023-10-13T14:27:21Z","published":"2023-10-13T14:27:21Z","title":"The Consensus Game: Language Model Generation via Equilibrium Search","summary":" When applied to question answering and other text generation tasks, language\nmodels (LMs) may be queried generatively (by sampling answers from their output\ndistribution) or discriminatively (by using them to score or rank a set of\ncandidate outputs). These procedures sometimes yield very different\npredictions. How do we reconcile mutually incompatible scoring procedures to\nobtain coherent LM predictions? We introduce a new, training-free,\ngame-theoretic procedure for language model decoding. Our approach casts\nlanguage model decoding as a regularized imperfect-information sequential\nsignaling game - which we term the CONSENSUS GAME - in which a GENERATOR seeks\nto communicate an abstract correctness parameter using natural language\nsentences to a DISCRIMINATOR. We develop computational procedures for finding\napproximate equilibria of this game, resulting in a decoding algorithm we call\nEQUILIBRIUM-RANKING. Applied to a large number of tasks (including reading\ncomprehension, commonsense reasoning, mathematical problem-solving, and\ndialog), EQUILIBRIUM-RANKING consistently, and sometimes substantially,\nimproves performance over existing LM decoding procedures - on multiple\nbenchmarks, we observe that applying EQUILIBRIUM-RANKING to LLaMA-7B\noutperforms the much larger LLaMA-65B and PaLM-540B models. These results\nhighlight the promise of game-theoretic tools for addressing fundamental\nchallenges of truthfulness and consistency in LMs.\n","authors":["Athul Paul Jacob","Yikang Shen","Gabriele Farina","Jacob Andreas"],"pdf_url":"https://arxiv.org/pdf/2310.09139v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07321v2","updated":"2023-10-13T14:24:31Z","published":"2023-10-11T09:09:55Z","title":"On the Impact of Cross-Domain Data on German Language Models","summary":" Traditionally, large language models have been either trained on general web\ncrawls or domain-specific data. However, recent successes of generative large\nlanguage models have shed light on the benefits of cross-domain datasets. To\nexamine the significance of prioritizing data diversity over quality, we\npresent a German dataset comprising texts from five domains, along with another\ndataset aimed at containing high-quality data. Through training a series of\nmodels ranging between 122M and 750M parameters on both datasets, we conduct a\ncomprehensive benchmark on multiple downstream tasks. Our findings demonstrate\nthat the models trained on the cross-domain dataset outperform those trained on\nquality data alone, leading to improvements up to $4.45\\%$ over the previous\nstate-of-the-art. The models are available at\nhttps://huggingface.co/ikim-uk-essen\n","authors":["Amin Dada","Aokun Chen","Cheng Peng","Kaleb E Smith","Ahmad Idrissi-Yaghir","Constantin Marc Seibold","Jianning Li","Lars Heiliger","Xi Yang","Christoph M. 
Friedrich","Daniel Truhn","Jan Egger","Jiang Bian","Jens Kleesiek","Yonghui Wu"],"pdf_url":"https://arxiv.org/pdf/2310.07321v2.pdf","comment":"13 pages, 1 figure, accepted at Findings of the Association for\n Computational Linguistics: EMNLP 2023"},{"id":"http://arxiv.org/abs/2310.09135v1","updated":"2023-10-13T14:23:33Z","published":"2023-10-13T14:23:33Z","title":"HierarchicalContrast: A Coarse-to-Fine Contrastive Learning Framework\n for Cross-Domain Zero-Shot Slot Filling","summary":" In task-oriented dialogue scenarios, cross-domain zero-shot slot filling\nplays a vital role in leveraging source domain knowledge to learn a model with\nhigh generalization ability in unknown target domain where annotated data is\nunavailable. However, the existing state-of-the-art zero-shot slot filling\nmethods have limited generalization ability in target domain, they only show\neffective knowledge transfer on seen slots and perform poorly on unseen slots.\nTo alleviate this issue, we present a novel Hierarchical Contrastive Learning\nFramework (HiCL) for zero-shot slot filling. Specifically, we propose a coarse-\nto fine-grained contrastive learning based on Gaussian-distributed embedding to\nlearn the generalized deep semantic relations between utterance-tokens, by\noptimizing inter- and intra-token distribution distance. This encourages HiCL\nto generalize to the slot types unseen at training phase. Furthermore, we\npresent a new iterative label set semantics inference method to unbiasedly and\nseparately evaluate the performance of unseen slot types which entangled with\ntheir counterparts (i.e., seen slot types) in the previous zero-shot slot\nfilling evaluation methods. The extensive empirical experiments on four\ndatasets demonstrate that the proposed method achieves comparable or even\nbetter performance than the current state-of-the-art zero-shot slot filling\napproaches.\n","authors":["Junwen Zhang","Yin Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.09135v1.pdf","comment":"EMNLP 2023"},{"id":"http://arxiv.org/abs/2310.09119v1","updated":"2023-10-13T14:03:01Z","published":"2023-10-13T14:03:01Z","title":"A Frustratingly Easy Plug-and-Play Detection-and-Reasoning Module for\n Chinese Spelling Check","summary":" In recent years, Chinese Spelling Check (CSC) has been greatly improved by\ndesigning task-specific pre-training methods or introducing auxiliary tasks,\nwhich mostly solve this task in an end-to-end fashion. In this paper, we\npropose to decompose the CSC workflow into detection, reasoning, and searching\nsubtasks so that the rich external knowledge about the Chinese language can be\nleveraged more directly and efficiently. Specifically, we design a\nplug-and-play detection-and-reasoning module that is compatible with existing\nSOTA non-autoregressive CSC models to further boost their performance. We find\nthat the detection-and-reasoning module trained for one model can also benefit\nother models. We also study the primary interpretability provided by the task\ndecomposition. 
Extensive experiments and detailed analyses demonstrate the\neffectiveness and competitiveness of the proposed module.\n","authors":["Haojing Huang","Jingheng Ye","Qingyu Zhou","Yinghui Li","Yangning Li","Feng Zhou","Hai-Tao Zheng"],"pdf_url":"https://arxiv.org/pdf/2310.09119v1.pdf","comment":"Accepted for publication in Findings of EMNLP 2023"},{"id":"http://arxiv.org/abs/2308.16687v2","updated":"2023-10-13T13:56:28Z","published":"2023-08-31T12:43:18Z","title":"DictaBERT: A State-of-the-Art BERT Suite for Modern Hebrew","summary":" We present DictaBERT, a new state-of-the-art pre-trained BERT model for\nmodern Hebrew, outperforming existing models on most benchmarks. Additionally,\nwe release three fine-tuned versions of the model, designed to perform three\nspecific foundational tasks in the analysis of Hebrew texts: prefix\nsegmentation, morphological tagging and question answering. These fine-tuned\nmodels allow any developer to perform prefix segmentation, morphological\ntagging and question answering of a Hebrew input with a single call to a\nHuggingFace model, without the need to integrate any additional libraries or\ncode. In this paper we describe the details of the training as well as the\nresults on the different benchmarks. We release the models to the community,\nalong with sample code demonstrating their use. We release these models as part\nof our goal to help further research and development in Hebrew NLP.\n","authors":["Shaltiel Shmidman","Avi Shmidman","Moshe Koppel"],"pdf_url":"https://arxiv.org/pdf/2308.16687v2.pdf","comment":"Updated second version, with links to two question-answering models"},{"id":"http://arxiv.org/abs/2310.09107v1","updated":"2023-10-13T13:52:15Z","published":"2023-10-13T13:52:15Z","title":"GLoRE: Evaluating Logical Reasoning of Large Language Models","summary":" Recently, large language models (LLMs), including notable models such as\nGPT-4 and burgeoning community models, have showcased significant general\nlanguage understanding abilities. However, there has been a scarcity of\nattempts to assess the logical reasoning capacities of these LLMs, an essential\nfacet of natural language understanding. To encourage further investigation in\nthis area, we introduce GLoRE, a meticulously assembled General Logical\nReasoning Evaluation benchmark comprised of 12 datasets that span three\ndifferent types of tasks. Our experimental results show that compared to the\nperformance of human and supervised fine-tuning, the logical reasoning\ncapabilities of open LLMs necessitate additional improvement; ChatGPT and\nGPT-4 show a strong capability of logical reasoning, with GPT-4 surpassing\nChatGPT by a large margin. We propose a self-consistency probing method to\nenhance the accuracy of ChatGPT and a fine-tuned method to boost the\nperformance of an open LLM. We release the datasets and evaluation programs to\nfacilitate future research.\n","authors":["Hanmeng liu","Zhiyang Teng","Ruoxi Ning","Jian Liu","Qiji Zhou","Yue Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.09107v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.06927v2","updated":"2023-10-13T13:47:44Z","published":"2023-10-10T18:28:38Z","title":"Sparse Fine-tuning for Inference Acceleration of Large Language Models","summary":" We consider the problem of accurate sparse fine-tuning of large language\nmodels (LLMs), that is, fine-tuning pretrained LLMs on specialized tasks, while\ninducing sparsity in their weights. 
On the accuracy side, we observe that\nstandard loss-based fine-tuning may fail to recover accuracy, especially at\nhigh sparsities. To address this, we perform a detailed study of\ndistillation-type losses, determining an L2-based distillation approach we term\nSquareHead which enables accurate recovery even at higher sparsities, across\nall model types. On the practical efficiency side, we show that sparse LLMs can\nbe executed with speedups by taking advantage of sparsity, for both CPU and GPU\nruntimes. While the standard approach is to leverage sparsity for computational\nreduction, we observe that in the case of memory-bound LLMs sparsity can also\nbe leveraged for reducing memory bandwidth. We exhibit end-to-end results\nshowing speedups due to sparsity, while recovering accuracy, on T5 (language\ntranslation), Whisper (speech translation), and open GPT-type (MPT for text\ngeneration). For MPT text generation, we show for the first time that sparse\nfine-tuning can reach 75% sparsity without accuracy drops, provide notable\nend-to-end speedups for both CPU and GPU inference, and highlight that sparsity\nis also compatible with quantization approaches. Models and software for\nreproducing our results are provided in Section 6.\n","authors":["Eldar Kurtic","Denis Kuznedelev","Elias Frantar","Michael Goin","Dan Alistarh"],"pdf_url":"https://arxiv.org/pdf/2310.06927v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02071v2","updated":"2023-10-13T13:43:53Z","published":"2023-10-03T14:13:36Z","title":"Towards End-to-End Embodied Decision Making via Multi-modal Large\n Language Model: Explorations with GPT4-Vision and Beyond","summary":" In this study, we explore the potential of Multimodal Large Language Models\n(MLLMs) in improving embodied decision-making processes for agents. While Large\nLanguage Models (LLMs) have been widely used due to their advanced reasoning\nskills and vast world knowledge, MLLMs like GPT4-Vision offer enhanced visual\nunderstanding and reasoning capabilities. We investigate whether\nstate-of-the-art MLLMs can handle embodied decision-making in an end-to-end\nmanner and whether collaborations between LLMs and MLLMs can enhance\ndecision-making. To address these questions, we introduce a new benchmark\ncalled PCA-EVAL, which evaluates embodied decision-making from the perspectives\nof Perception, Cognition, and Action. Additionally, we propose HOLMES, a\nmulti-agent cooperation framework that allows LLMs to leverage MLLMs and APIs\nto gather multimodal information for informed decision-making. We compare\nend-to-end embodied decision-making and HOLMES on our benchmark and find that\nthe GPT4-Vision model demonstrates strong end-to-end embodied decision-making\nabilities, outperforming GPT4-HOLMES in terms of average decision accuracy\n(+3%). However, this performance is exclusive to the latest GPT4-Vision model,\nsurpassing the open-source state-of-the-art MLLM by 26%. Our results indicate\nthat powerful MLLMs like GPT4-Vision hold promise for decision-making in\nembodied agents, offering new avenues for MLLM research. 
Code and data are open\nat https://github.com/pkunlp-icler/PCA-EVAL/.\n","authors":["Liang Chen","Yichi Zhang","Shuhuai Ren","Haozhe Zhao","Zefan Cai","Yuchi Wang","Peiyi Wang","Tianyu Liu","Baobao Chang"],"pdf_url":"https://arxiv.org/pdf/2310.02071v2.pdf","comment":"18 pages, 10 figures, Code and data:\n https://github.com/pkunlp-icler/PCA-EVAL/"},{"id":"http://arxiv.org/abs/2211.03462v2","updated":"2023-10-13T13:20:51Z","published":"2022-11-07T11:25:21Z","title":"NAPG: Non-Autoregressive Program Generation for Hybrid Tabular-Textual\n Question Answering","summary":" Hybrid tabular-textual question answering (QA) requires reasoning from\nheterogeneous information, and the types of reasoning are mainly divided into\nnumerical reasoning and span extraction. Current numerical reasoning methods\nautoregressively decode program sequences, and each decoding step produces\neither an operator or an operand. However, the step-by-step decoding suffers\nfrom exposure bias, and the accuracy of program generation drops sharply as the\ndecoding steps unfold due to error propagation. In this paper, we propose a\nnon-autoregressive program generation framework, which independently generates\ncomplete program tuples containing both operators and operands, can address the\nerror propagation issue while significantly boosting the speed of program\ngeneration. Experiments on the ConvFinQA and MultiHiertt datasets show that our\nnon-autoregressive program generation method can bring about substantial\nimprovements over the strong FinQANet (+5.06 Exe Acc and +4.80 Prog Acc points)\nand MT2Net (+7.97 EM and +6.38 F1 points) baselines, establishing the new\nstate-of-the-art performance, while being much faster (21x) in program\ngeneration. Finally, with increasing numbers of numerical reasoning steps the\nperformance drop of our method is significantly smaller than that of the\nbaselines. Our code will be publicly available soon.\n","authors":["Tengxun Zhang","Hongfei Xu","Josef van Genabith","Deyi Xiong","Hongying Zan"],"pdf_url":"https://arxiv.org/pdf/2211.03462v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09089v1","updated":"2023-10-13T13:17:03Z","published":"2023-10-13T13:17:03Z","title":"Qilin-Med: Multi-stage Knowledge Injection Advanced Medical Large\n Language Model","summary":" Integrating large language models (LLMs) into healthcare presents potential\nbut faces challenges. Directly pre-training LLMs for domains like medicine is\nresource-heavy and sometimes unfeasible. Sole reliance on Supervised\nFine-tuning (SFT) can result in overconfident predictions and may not tap into\ndomain specific insights. Addressing these challenges, we present a multi-stage\ntraining method combining Domain-specific Continued Pre-training (DCPT), SFT,\nand Direct Preference Optimization (DPO). A notable contribution of our study\nis the introduction of a 3Gb Chinese Medicine (ChiMed) dataset, encompassing\nmedical question answering, plain texts, knowledge graphs, and dialogues,\nsegmented into three training stages. The medical LLM trained with our\npipeline, Qilin-Med, exhibits significant performance boosts. In the CPT and\nSFT phases, it achieves 38.4% and 40.0% accuracy on the CMExam, surpassing\nBaichuan-7B's 33.5%. 
In the DPO phase, on the Huatuo-26M test set, it scores\n16.66 in BLEU-1 and 27.44 in ROUGE1, outperforming the SFT's 12.69 and 24.21.\nThis highlights the strength of our training approach in refining LLMs for\nmedical applications.\n","authors":["Qichen Ye","Junling Liu","Dading Chong","Peilin Zhou","Yining Hua","Andrew Liu"],"pdf_url":"https://arxiv.org/pdf/2310.09089v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09088v1","updated":"2023-10-13T13:16:57Z","published":"2023-10-13T13:16:57Z","title":"Dialect Transfer for Swiss German Speech Translation","summary":" This paper investigates the challenges in building Swiss German speech\ntranslation systems, specifically focusing on the impact of dialect diversity\nand differences between Swiss German and Standard German. Swiss German is a\nspoken language with no formal writing system; it comprises many diverse\ndialects and is a low-resource language with only around 5 million speakers.\nThe study is guided by two key research questions: how does the inclusion and\nexclusion of dialects during the training of speech translation models for\nSwiss German impact the performance on specific dialects, and how do the\ndifferences between Swiss German and Standard German impact the performance of\nthe systems? We show that dialect diversity and linguistic differences pose\nsignificant challenges to Swiss German speech translation, which is in line\nwith linguistic hypotheses derived from empirical investigations.\n","authors":["Claudio Paonessa","Yanick Schraner","Jan Deriu","Manuela Hürlimann","Manfred Vogel","Mark Cieliebak"],"pdf_url":"https://arxiv.org/pdf/2310.09088v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.13619v2","updated":"2023-10-13T12:13:21Z","published":"2023-02-27T09:40:41Z","title":"Orca: A Few-shot Benchmark for Chinese Conversational Machine Reading\n Comprehension","summary":" The conversational machine reading comprehension (CMRC) task aims to answer\nquestions in conversations, which has been a hot research topic in recent years\nbecause of its wide applications. However, existing CMRC benchmarks in which\neach conversation is assigned a static passage are inconsistent with real\nscenarios. Thus, a model's comprehension ability in real scenarios is hard\nto evaluate reasonably. To this end, we propose the first Chinese CMRC\nbenchmark Orca and further provide zero-shot/few-shot settings to evaluate\na model's generalization ability across diverse domains. We collect 831\nhot-topic driven conversations with 4,742 turns in total. Each turn of a\nconversation is assigned a response-related passage, aiming to evaluate\nthe model's comprehension ability more reasonably. The topics of conversations are\ncollected from social media platforms and cover 33 domains, trying to be\nconsistent with real scenarios. Importantly, answers in Orca are all\nwell-annotated natural responses rather than the specific spans or short phrases\nin previous datasets. Besides, we implement three strong baselines to tackle\nthe challenge in Orca. The results indicate the great challenge of our CMRC\nbenchmark. 
Our dataset and checkpoints are available at\nhttps://github.com/nuochenpku/Orca.\n","authors":["Nuo Chen","Hongguang Li","Junqing He","Yinan Bao","Xinshi Lin","Qi Yang","Jianfeng Liu","Ruyi Gan","Jiaxing Zhang","Baoyuan Wang","Jia Li"],"pdf_url":"https://arxiv.org/pdf/2302.13619v2.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2310.09044v1","updated":"2023-10-13T12:12:34Z","published":"2023-10-13T12:12:34Z","title":"KCTS: Knowledge-Constrained Tree Search Decoding with Token-Level\n Hallucination Detection","summary":" Large Language Models (LLMs) have demonstrated remarkable human-level natural\nlanguage generation capabilities. However, their potential to generate\nmisinformation, often called the hallucination problem, poses a significant\nrisk to their deployment. A common approach to address this issue is to\nretrieve relevant knowledge and fine-tune the LLM with the knowledge in its\ninput. Unfortunately, this method incurs high training costs and may cause\ncatastrophic forgetting for multi-tasking models. To overcome these\nlimitations, we propose a knowledge-constrained decoding method called KCTS\n(Knowledge-Constrained Tree Search), which guides a frozen LM to generate text\naligned with the reference knowledge at each decoding step using a knowledge\nclassifier score and MCTS (Monte-Carlo Tree Search). To adapt the\nsequence-level knowledge classifier to token-level guidance, we also propose a\nnovel token-level hallucination detection method called RIPA (Reward Inflection\nPoint Approximation). Our empirical results on knowledge-grounded dialogue and\nabstractive summarization demonstrate the strength of KCTS as a plug-and-play,\nmodel-agnostic decoding method that can effectively reduce hallucinations in\nnatural language generation.\n","authors":["Sehyun Choi","Tianqing Fang","Zhaowei Wang","Yangqiu Song"],"pdf_url":"https://arxiv.org/pdf/2310.09044v1.pdf","comment":"Accepted at EMNLP 2023 Main Conference"},{"id":"http://arxiv.org/abs/2310.02778v2","updated":"2023-10-13T12:10:01Z","published":"2023-10-04T12:50:26Z","title":"Integrating UMLS Knowledge into Large Language Models for Medical\n Question Answering","summary":" Large language models (LLMs) have demonstrated powerful text generation\ncapabilities, bringing unprecedented innovation to the healthcare field. While\nLLMs hold immense promise for applications in healthcare, applying them to real\nclinical scenarios presents significant challenges, as these models may\ngenerate content that deviates from established medical facts and even exhibit\npotential biases. In our research, we develop an augmented LLM framework based\non the Unified Medical Language System (UMLS), aiming to better serve the\nhealthcare community. We employ LLaMa2-13b-chat and ChatGPT-3.5 as our\nbenchmark models, and conduct automatic evaluations using the ROUGE Score and\nBERTScore on 104 questions from the LiveQA test set. Additionally, we establish\ncriteria for physician-evaluation based on four dimensions: Factuality,\nCompleteness, Readability and Relevancy. ChatGPT-3.5 is used for physician\nevaluation with 20 questions on the LiveQA test set. Multiple resident\nphysicians conducted blind reviews to evaluate the generated content, and the\nresults indicate that this framework effectively enhances the factuality,\ncompleteness, and relevance of generated content. 
Our research demonstrates the\neffectiveness of using UMLS-augmented LLMs and highlights the potential\napplication value of LLMs in medical question-answering.\n","authors":["Rui Yang","Edison Marrese-Taylor","Yuhe Ke","Lechao Cheng","Qingyu Chen","Irene Li"],"pdf_url":"https://arxiv.org/pdf/2310.02778v2.pdf","comment":"12 pages, 3 figures"},{"id":"http://arxiv.org/abs/2210.03029v3","updated":"2023-10-13T11:58:30Z","published":"2022-10-06T16:26:03Z","title":"Efficiently Enhancing Zero-Shot Performance of Instruction Following\n Model via Retrieval of Soft Prompt","summary":" Enhancing the zero-shot performance of instruction-following models requires\nheavy computation, either by scaling the total number of training datasets or\nthe model size. In this work, we explore how retrieval of soft prompts obtained\nthrough prompt tuning can efficiently assist hard prompts in zero-shot task\ngeneralization. Specifically, we train soft prompt embeddings for each prompt\nthrough prompt tuning, store the samples of the training instances mapped with\nthe prompt embeddings, and retrieve the corresponding prompt embedding of the\ntraining instance closest to the query instance during inference. While only\nadding 0.007% additional parameters, retrieval of soft prompt enhances the\nperformance of T0 on unseen tasks by outperforming it on 10 out of 11 datasets\nas well as improving the mean accuracy of T0 on the BIG-bench benchmark by 2.39%\npoints. Also, we report an interesting finding that retrieving source\nembeddings trained on similar answer choice formats is more important than\nthose on similar task types.\n","authors":["Seonghyeon Ye","Joel Jang","Doyoung Kim","Yongrae Jo","Minjoon Seo"],"pdf_url":"https://arxiv.org/pdf/2210.03029v3.pdf","comment":"EMNLP 2023 Findings"},{"id":"http://arxiv.org/abs/2310.09036v1","updated":"2023-10-13T11:57:04Z","published":"2023-10-13T11:57:04Z","title":"MM-BigBench: Evaluating Multimodal Models on Multimodal Content\n Comprehension Tasks","summary":" The popularity of multimodal large language models (MLLMs) has triggered a\nrecent surge in research efforts dedicated to evaluating these models.\nNevertheless, existing evaluation studies of MLLMs primarily focus on the\ncomprehension and reasoning of unimodal (vision) content, neglecting\nperformance evaluations in the domain of multimodal (vision-language) content\nunderstanding. Beyond multimodal reasoning, tasks related to multimodal content\ncomprehension necessitate a profound understanding of multimodal contexts,\nachieved through the multimodal interaction to obtain a final answer. In this\npaper, we introduce a comprehensive assessment framework called MM-BigBench,\nwhich incorporates a diverse range of metrics to offer an extensive evaluation\nof the performance of various models and instructions across a wide spectrum of\ndiverse multimodal content comprehension tasks. Consequently, our work\ncomplements research on the performance of MLLMs in multimodal comprehension\ntasks, achieving a more comprehensive and holistic evaluation of MLLMs. To\nbegin, we employ the Best Performance metric to ascertain each model's\nperformance upper bound on different datasets. 
Subsequently, the Mean Relative\nGain metric offers an assessment of the overall performance of various models\nand instructions, while the Stability metric measures their sensitivity.\nFurthermore, previous research centers on evaluating models independently or\nsolely assessing instructions, neglecting the adaptability between models and\ninstructions. We propose the Adaptability metric to quantify the adaptability\nbetween models and instructions. Our paper evaluates a total of 20 language\nmodels (14 MLLMs) on 14 multimodal datasets spanning 6 tasks, with 10\ninstructions for each task, and derives novel insights. Our code will be\nreleased at https://github.com/declare-lab/MM-BigBench.\n","authors":["Xiaocui Yang","Wenfang Wu","Shi Feng","Ming Wang","Daling Wang","Yang Li","Qi Sun","Yifei Zhang","Xiaoming Fu","Soujanya Poria"],"pdf_url":"https://arxiv.org/pdf/2310.09036v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2310.09017v1","updated":"2023-10-13T11:28:02Z","published":"2023-10-13T11:28:02Z","title":"Don't Add, don't Miss: Effective Content Preserving Generation from\n Pre-Selected Text Spans","summary":" The recently introduced Controlled Text Reduction (CTR) task isolates the\ntext generation step within typical summarization-style tasks. It does so by\nchallenging models to generate coherent text conforming to pre-selected content\nwithin the input text (\"highlights\").\n This framing enables increased modularity in summarization-like tasks,\nallowing one to couple a single CTR model with various content-selection setups and\nmodules.\n However, there are currently no reliable CTR models, while the performance of\nthe existing baseline for the task is mediocre, falling short of practical\nutility.\n Here, we address this gap by introducing a high-quality, open-source CTR\nmodel that tackles two prior key limitations: inadequate enforcement of the\ncontent-preservation constraint, and suboptimal silver training data.\n Addressing these, we amplify the content-preservation constraint in both\ntraining, via RL, and inference, via a controlled decoding strategy.\n Further, we substantially improve the silver training data quality via GPT-4\ndistillation.\n Overall, pairing the distilled dataset with the highlight-adherence\nstrategies yields marked gains over the current baseline, of up to 30 ROUGE-L\npoints, providing a reliable CTR model for downstream use.\n","authors":["Aviv Slobodkin","Avi Caciularu","Eran Hirsch","Ido Dagan"],"pdf_url":"https://arxiv.org/pdf/2310.09017v1.pdf","comment":"EMNLP 2023, findings"},{"id":"http://arxiv.org/abs/2305.13066v2","updated":"2023-10-13T11:19:19Z","published":"2023-05-22T14:36:32Z","title":"Biomedical Named Entity Recognition via Dictionary-based Synonym\n Generalization","summary":" Biomedical named entity recognition is one of the core tasks in biomedical\nnatural language processing (BioNLP). To tackle this task, numerous\nsupervised/distantly supervised approaches have been proposed. Despite their\nremarkable success, these approaches inescapably demand laborious human effort.\nTo alleviate the need for human effort, dictionary-based approaches have been\nproposed to extract named entities simply based on a given dictionary. However,\none downside of existing dictionary-based approaches is that they are\nchallenged to identify concept synonyms that are not listed in the given\ndictionary, which we refer to as the synonym generalization problem. 
In this\nstudy, we propose a novel Synonym Generalization (SynGen) framework that\nrecognizes the biomedical concepts contained in the input text using span-based\npredictions. In particular, SynGen introduces two regularization terms, namely,\n(1) a synonym distance regularizer; and (2) a noise perturbation regularizer,\nto minimize the synonym generalization error. To demonstrate the effectiveness\nof our approach, we provide a theoretical analysis of the bound of synonym\ngeneralization error. We extensively evaluate our approach on a wide range of\nbenchmarks and the results verify that SynGen outperforms previous\ndictionary-based models by notable margins. Lastly, we provide a detailed\nanalysis to further reveal the merits and inner-workings of our approach.\n","authors":["Zihao Fu","Yixuan Su","Zaiqiao Meng","Nigel Collier"],"pdf_url":"https://arxiv.org/pdf/2305.13066v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07397v2","updated":"2023-10-13T11:16:58Z","published":"2023-10-11T11:32:57Z","title":"Target-oriented Proactive Dialogue Systems with Personalization: Problem\n Formulation and Dataset Curation","summary":" Target-oriented dialogue systems, designed to proactively steer conversations\ntoward predefined targets or accomplish specific system-side goals, are an\nexciting area in conversational AI. In this work, by formulating a pair as the conversation target, we explore a novel problem of\npersonalized target-oriented dialogue by considering personalization during the\ntarget accomplishment process. However, there remains an emergent need for\nhigh-quality datasets, and building one from scratch requires tremendous human\neffort. To address this, we propose an automatic dataset curation framework\nusing a role-playing approach. Based on this framework, we construct a\nlarge-scale personalized target-oriented dialogue dataset, TopDial, which\ncomprises about 18K multi-turn dialogues. The experimental results show that\nthis dataset is of high quality and could contribute to exploring personalized\ntarget-oriented dialogue.\n","authors":["Jian Wang","Yi Cheng","Dongding Lin","Chak Tou Leong","Wenjie Li"],"pdf_url":"https://arxiv.org/pdf/2310.07397v2.pdf","comment":"Accepted to EMNLP-2023 main conference"},{"id":"http://arxiv.org/abs/2305.13026v2","updated":"2023-10-13T10:43:05Z","published":"2023-05-22T13:27:37Z","title":"DUMB: A Benchmark for Smart Evaluation of Dutch Models","summary":" We introduce the Dutch Model Benchmark: DUMB. The benchmark includes a\ndiverse set of datasets for low-, medium- and high-resource tasks. The total\nset of nine tasks includes four tasks that were previously not available in\nDutch. Instead of relying on a mean score across tasks, we propose Relative\nError Reduction (RER), which compares the DUMB performance of language models\nto a strong baseline which can be referred to in the future even when assessing\ndifferent sets of language models. Through a comparison of 14 pre-trained\nlanguage models (mono- and multi-lingual, of varying sizes), we assess the\ninternal consistency of the benchmark tasks, as well as the factors that likely\nenable high performance. Our results indicate that current Dutch monolingual\nmodels under-perform and suggest training larger Dutch models with other\narchitectures and pre-training objectives. At present, the highest performance\nis achieved by DeBERTaV3 (large), XLM-R (large) and mDeBERTaV3 (base). 
In\naddition to highlighting best strategies for training larger Dutch models, DUMB\nwill foster further research on Dutch. A public leaderboard is available at\nhttps://dumbench.nl.\n","authors":["Wietse de Vries","Martijn Wieling","Malvina Nissim"],"pdf_url":"https://arxiv.org/pdf/2305.13026v2.pdf","comment":"EMNLP 2023 camera-ready"},{"id":"http://arxiv.org/abs/2310.08992v1","updated":"2023-10-13T10:17:48Z","published":"2023-10-13T10:17:48Z","title":"CodeChain: Towards Modular Code Generation Through Chain of\n Self-revisions with Representative Sub-modules","summary":" Large Language Models (LLMs) have already become quite proficient at solving\nsimpler programming tasks like those in HumanEval or MBPP benchmarks. However,\nsolving more complex and competitive programming tasks is still quite\nchallenging for these models - possibly due to their tendency to generate\nsolutions as monolithic code blocks instead of decomposing them into logical\nsub-tasks and sub-modules. On the other hand, experienced programmers\ninstinctively write modularized code with abstraction for solving complex\ntasks, often reusing previously developed modules. To address this gap, we\npropose CodeChain, a novel framework for inference that elicits modularized\ncode generation through a chain of self-revisions, each being guided by some\nrepresentative sub-modules generated in previous iterations. Concretely,\nCodeChain first instructs the LLM to generate modularized codes through\nchain-of-thought prompting. Then it applies a chain of self-revisions by\niterating the two steps: 1) extracting and clustering the generated sub-modules\nand selecting the cluster representatives as the more generic and re-usable\nimplementations, and 2) augmenting the original chain-of-thought prompt with\nthese selected module-implementations and instructing the LLM to re-generate\nnew modularized solutions. We find that by naturally encouraging the LLM to\nreuse the previously developed and verified sub-modules, CodeChain can\nsignificantly boost both modularity as well as correctness of the generated\nsolutions, achieving relative pass@1 improvements of 35% on APPS and 76% on\nCodeContests. It is shown to be effective on both OpenAI LLMs as well as\nopen-sourced LLMs like WizardCoder. We also conduct comprehensive ablation\nstudies with different methods of prompting, number of clusters, model sizes,\nprogram qualities, etc., to provide useful insights that underpin CodeChain's\nsuccess.\n","authors":["Hung Le","Hailin Chen","Amrita Saha","Akash Gokul","Doyen Sahoo","Shafiq Joty"],"pdf_url":"https://arxiv.org/pdf/2310.08992v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.05364v3","updated":"2023-10-13T09:47:08Z","published":"2023-10-09T02:50:54Z","title":"Universal Multi-modal Entity Alignment via Iteratively Fusing Modality\n Similarity Paths","summary":" The objective of Entity Alignment (EA) is to identify equivalent entity pairs\nfrom multiple Knowledge Graphs (KGs) and create a more comprehensive and\nunified KG. The majority of EA methods have primarily focused on the structural\nmodality of KGs, lacking exploration of multi-modal information. A few\nmulti-modal EA methods have made good attempts in this field. Still, they have\ntwo shortcomings: (1) inconsistent and inefficient modality modeling that\ndesigns complex and distinct models for each modality; (2) ineffective modality\nfusion due to the heterogeneous nature of modalities in EA. 
To tackle these\nchallenges, we propose PathFusion, consisting of two main components: (1) MSP,\na unified modeling approach that simplifies the alignment process by\nconstructing paths connecting entities and modality nodes to represent multiple\nmodalities; (2) IRF, an iterative fusion method that effectively combines\ninformation from different modalities using the path as an information carrier.\nExperimental results on real-world datasets demonstrate the superiority of\nPathFusion over state-of-the-art methods, with 22.4%-28.9% absolute improvement\non Hits@1, and 0.194-0.245 absolute improvement on MRR.\n","authors":["Bolin Zhu","Xiaoze Liu","Xin Mao","Zhuo Chen","Lingbing Guo","Tao Gui","Qi Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.05364v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08975v1","updated":"2023-10-13T09:45:14Z","published":"2023-10-13T09:45:14Z","title":"ChatKBQA: A Generate-then-Retrieve Framework for Knowledge Base Question\n Answering with Fine-tuned Large Language Models","summary":" Knowledge Base Question Answering (KBQA) aims to derive answers to natural\nlanguage questions over large-scale knowledge bases (KBs), which are generally\ndivided into two research components: knowledge retrieval and semantic parsing.\nHowever, three core challenges remain, including inefficient knowledge\nretrieval, retrieval errors adversely affecting semantic parsing, and the\ncomplexity of previous KBQA methods. In the era of large language models\n(LLMs), we introduce ChatKBQA, a novel generate-then-retrieve KBQA framework\nbuilt on fine-tuning open-source LLMs such as Llama-2, ChatGLM2 and Baichuan2.\nChatKBQA proposes generating the logical form with fine-tuned LLMs first, then\nretrieving and replacing entities and relations through an unsupervised\nretrieval method, which improves both generation and retrieval more\nstraightforwardly. Experimental results reveal that ChatKBQA achieves new\nstate-of-the-art performance on standard KBQA datasets, WebQSP, and\nComplexWebQuestions (CWQ). This work also provides a new paradigm for combining\nLLMs with knowledge graphs (KGs) for interpretable and knowledge-required\nquestion answering. Our code is publicly available.\n","authors":["Haoran Luo","Haihong E","Zichen Tang","Shiyao Peng","Yikai Guo","Wentai Zhang","Chenghao Ma","Guanting Dong","Meina Song","Wei Lin"],"pdf_url":"https://arxiv.org/pdf/2310.08975v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2310.08967v1","updated":"2023-10-13T09:18:57Z","published":"2023-10-13T09:18:57Z","title":"Towards Example-Based NMT with Multi-Levenshtein Transformers","summary":" Retrieval-Augmented Machine Translation (RAMT) is attracting growing\nattention. This is because RAMT not only improves translation metrics, but is\nalso assumed to implement some form of domain adaptation. In this contribution,\nwe study another salient trait of RAMT, its ability to make translation\ndecisions more transparent by allowing users to go back to examples that\ncontributed to these decisions.\n For this, we propose a novel architecture aiming to increase this\ntransparency. This model adapts a retrieval-augmented version of the\nLevenshtein Transformer and makes it amenable to simultaneously edit multiple\nfuzzy matches found in memory. 
We discuss how to perform training and inference\nin this model, based on multi-way alignment algorithms and imitation learning.\nOur experiments show that editing several examples positively impacts\ntranslation scores, notably increasing the number of target spans that are\ncopied from existing instances.\n","authors":["Maxime Bouthors","Josep Crego","François Yvon"],"pdf_url":"https://arxiv.org/pdf/2310.08967v1.pdf","comment":"17 pages, EMNLP 2023 submission"},{"id":"http://arxiv.org/abs/2310.08958v1","updated":"2023-10-13T09:07:13Z","published":"2023-10-13T09:07:13Z","title":"xDial-Eval: A Multilingual Open-Domain Dialogue Evaluation Benchmark","summary":" Recent advancements in reference-free learned metrics for open-domain\ndialogue evaluation have been driven by the progress in pre-trained language\nmodels and the availability of dialogue data with high-quality human\nannotations. However, current studies predominantly concentrate on English\ndialogues, and the generalization of these metrics to other languages has not\nbeen fully examined. This is largely due to the absence of a multilingual\ndialogue evaluation benchmark. To address the issue, we introduce xDial-Eval,\nbuilt on top of open-source English dialogue evaluation datasets. xDial-Eval\nincludes 12 turn-level and 6 dialogue-level English datasets, comprising 14930\nannotated turns and 8691 annotated dialogues respectively. The English dialogue\ndata are extended to nine other languages with commercial machine translation\nsystems. On xDial-Eval, we conduct comprehensive analyses of previous\nBERT-based metrics and the recently-emerged large language models. Lastly, we\nestablish strong self-supervised and multilingual baselines. In terms of\naverage Pearson correlations over all datasets and languages, the best baseline\noutperforms OpenAI's ChatGPT by absolute improvements of 6.5% and 4.6% at the\nturn and dialogue levels respectively, albeit with much fewer parameters. The\ndata and code are publicly available at https://github.com/e0397123/xDial-Eval.\n","authors":["Chen Zhang","Luis Fernando D'Haro","Chengguang Tang","Ke Shi","Guohua Tang","Haizhou Li"],"pdf_url":"https://arxiv.org/pdf/2310.08958v1.pdf","comment":"Accepted to EMNLP-2023 Findings"},{"id":"http://arxiv.org/abs/2305.02615v2","updated":"2023-10-13T09:02:23Z","published":"2023-05-04T07:45:49Z","title":"How to Enhance Causal Discrimination of Utterances: A Case on Affective\n Reasoning","summary":" Our investigation into the Affective Reasoning in Conversation (ARC) task\nhighlights the challenge of causal discrimination. Almost all existing models,\nincluding large language models (LLMs), excel at capturing semantic\ncorrelations within utterance embeddings but fall short in determining the\nspecific causal relationships. To overcome this limitation, we propose the\nincorporation of \\textit{i.i.d.} noise terms into the conversation process,\nthereby constructing a structural causal model (SCM). It explores how distinct\ncausal relationships of fitted embeddings can be discerned through independent\nconditions. To facilitate the implementation of deep learning, we introduce the\ncogn frameworks to handle unstructured conversation data, and employ an\nautoencoder architecture to regard the unobservable noise as learnable\n\"implicit causes.\" Moreover, we curate a synthetic dataset that includes i.i.d.\nnoise. Through comprehensive experiments, we validate the effectiveness and\ninterpretability of our approach. 
Our code is available at\nhttps://github.com/Zodiark-ch/mater-of-our-EMNLP2023-paper.\n","authors":["Hang Chen","Jing Luo","Xinyu Yang","Wenjing Zhu"],"pdf_url":"https://arxiv.org/pdf/2305.02615v2.pdf","comment":"accepted via EMNLP2023-main"},{"id":"http://arxiv.org/abs/2310.08954v1","updated":"2023-10-13T08:55:19Z","published":"2023-10-13T08:55:19Z","title":"Textual Analysis of ICALEPCS and IPAC Conference Proceedings: Revealing\n Research Trends, Topics, and Collaborations for Future Insights and Advanced\n Search","summary":" In this paper, we present a textual analysis of past ICALEPCS and IPAC\nconference proceedings to gain insights into the research trends and topics\ndiscussed in the field. We use natural language processing techniques to\nextract meaningful information from the abstracts and papers of past conference\nproceedings. We extract topics to visualize and identify trends, analyze their\nevolution to identify emerging research directions, and highlight interesting\npublications based solely on their content with an analysis of their network.\nAdditionally, we will provide an advanced search tool to better search the\nexisting papers, prevent duplication, and make reference finding easier. Our\nanalysis provides a comprehensive overview of the research landscape in the\nfield and helps researchers and practitioners to better understand the\nstate-of-the-art and identify areas for future research.\n","authors":["Antonin Sulc","Annika Eichler","Tim Wilksen"],"pdf_url":"https://arxiv.org/pdf/2310.08954v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08949v1","updated":"2023-10-13T08:38:56Z","published":"2023-10-13T08:38:56Z","title":"Making Multimodal Generation Easier: When Diffusion Models Meet LLMs","summary":" We present EasyGen, an efficient model designed to enhance multimodal\nunderstanding and generation by harnessing the capabilities of diffusion models\nand large language models (LLMs). Unlike existing multimodal models that\npredominantly depend on encoders like CLIP or ImageBind and need ample amounts\nof training data to bridge the gap between modalities, EasyGen is built upon a\nbidirectional conditional diffusion model named BiDiffuser, which promotes more\nefficient interactions between modalities. EasyGen handles image-to-text\ngeneration by integrating BiDiffuser and an LLM via a simple projection layer.\nUnlike most existing multimodal models that are limited to generating text\nresponses, EasyGen can also facilitate text-to-image generation by leveraging\nthe LLM to create textual descriptions, which can be interpreted by BiDiffuser\nto generate appropriate visual responses. Extensive quantitative and\nqualitative experiments demonstrate the effectiveness of EasyGen, whose\ntraining can be easily achieved in a lab setting. The source code is available\nat https://github.com/zxy556677/EasyGen.\n","authors":["Xiangyu Zhao","Bo Liu","Qijiong Liu","Guangyuan Shi","Xiao-Ming Wu"],"pdf_url":"https://arxiv.org/pdf/2310.08949v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08944v1","updated":"2023-10-13T08:19:31Z","published":"2023-10-13T08:19:31Z","title":"CAMELL: Confidence-based Acquisition Model for Efficient Self-supervised\n Active Learning with Label Validation","summary":" Supervised neural approaches are hindered by their dependence on large,\nmeticulously annotated datasets, a requirement that is particularly cumbersome\nfor sequential tasks. 
The quality of annotations tends to deteriorate with the\ntransition from expert-based to crowd-sourced labelling. To address these\nchallenges, we present \\textbf{CAMELL} (Confidence-based Acquisition Model for\nEfficient self-supervised active Learning with Label validation), a pool-based\nactive learning framework tailored for sequential multi-output problems. CAMELL\npossesses three core features: (1) it requires expert annotators to label only\na fraction of a chosen sequence, (2) it facilitates self-supervision for the\nremainder of the sequence, and (3) it employs a label validation mechanism to\nprevent erroneous labels from contaminating the dataset and harming model\nperformance. We evaluate CAMELL on sequential tasks, with a special emphasis on\ndialogue belief tracking, a task plagued by the constraints of limited and\nnoisy datasets. Our experiments demonstrate that CAMELL outperforms the\nbaselines in terms of efficiency. Furthermore, the data corrections suggested\nby our method contribute to an overall improvement in the quality of the\nresulting datasets.\n","authors":["Carel van Niekerk","Christian Geishauser","Michael Heck","Shutong Feng","Hsien-chin Lin","Nurul Lubis","Benjamin Ruppik","Renato Vukovic","Milica Gašić"],"pdf_url":"https://arxiv.org/pdf/2310.08944v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08943v1","updated":"2023-10-13T08:16:27Z","published":"2023-10-13T08:16:27Z","title":"Multi-level Adaptive Contrastive Learning for Knowledge Internalization\n in Dialogue Generation","summary":" Knowledge-grounded dialogue generation aims to mitigate the issue of text\ndegeneration by incorporating external knowledge to supplement the context.\nHowever, the model often fails to internalize this information into responses\nin a human-like manner. Instead, it simply inserts segments of the provided\nknowledge into generic responses. As a result, the generated responses tend to\nbe tedious, incoherent, and in lack of interactivity which means the\ndegeneration problem is still unsolved. In this work, we first find that such\ncopying-style degeneration is primarily due to the weak likelihood objective,\nwhich allows the model to \"cheat\" the objective by merely duplicating knowledge\nsegments in a superficial pattern matching based on overlap. To overcome this\nchallenge, we then propose a Multi-level Adaptive Contrastive Learning (MACL)\nframework that dynamically samples negative examples and subsequently penalizes\ndegeneration behaviors at both the token-level and sequence-level. Extensive\nexperiments on the WoW dataset demonstrate the effectiveness of our approach\nacross various pre-trained models.\n","authors":["Chenxu Yang","Zheng Lin","Lanrui Wang","Chong Tian","Liang Pang","Jiangnan Li","Yanan Cao","Weiping Wang"],"pdf_url":"https://arxiv.org/pdf/2310.08943v1.pdf","comment":"EMNLP 2023 main conference"},{"id":"http://arxiv.org/abs/2310.08923v1","updated":"2023-10-13T07:49:11Z","published":"2023-10-13T07:49:11Z","title":"Towards Informative Few-Shot Prompt with Maximum Information Gain for\n In-Context Learning","summary":" Large Language models (LLMs) possess the capability to engage In-context\nLearning (ICL) by leveraging a few demonstrations pertaining to a new\ndownstream task as conditions. However, this particular learning paradigm\nsuffers from high instability stemming from substantial variances induced by\nfactors such as the input distribution of selected examples, their ordering,\nand prompt formats. 
In this work, we demonstrate that even when all these\nfactors are held constant, the random selection of examples still results in\nhigh variance. Consequently, we aim to explore the informative ability of data\nexamples by quantifying the Information Gain (IG) obtained in prediction after\nobserving a given example candidate. Then we propose to sample those with\nmaximum IG. Additionally, we identify the presence of template bias, which can\nlead to unfair evaluations of IG during the sampling process. To mitigate this\nbias, we introduce Calibration Before Sampling strategy. The experimental\nresults illustrate that our proposed method can yield an average relative\nimprovement of 14.3% across six classification tasks using three LLMs.\n","authors":["Hongfu Liu","Ye Wang"],"pdf_url":"https://arxiv.org/pdf/2310.08923v1.pdf","comment":"Accepted to the Findings of EMNLP 2023"},{"id":"http://arxiv.org/abs/2310.08917v1","updated":"2023-10-13T07:40:12Z","published":"2023-10-13T07:40:12Z","title":"Relation-aware Ensemble Learning for Knowledge Graph Embedding","summary":" Knowledge graph (KG) embedding is a fundamental task in natural language\nprocessing, and various methods have been proposed to explore semantic patterns\nin distinctive ways. In this paper, we propose to learn an ensemble by\nleveraging existing methods in a relation-aware manner. However, exploring\nthese semantics using relation-aware ensemble leads to a much larger search\nspace than general ensemble methods. To address this issue, we propose a\ndivide-search-combine algorithm RelEns-DSC that searches the relation-wise\nensemble weights independently. This algorithm has the same computation cost as\ngeneral ensemble methods but with much better performance. Experimental results\non benchmark datasets demonstrate the effectiveness of the proposed method in\nefficiently searching relation-aware ensemble weights and achieving\nstate-of-the-art embedding performance. The code is public at\nhttps://github.com/LARS-research/RelEns.\n","authors":["Ling Yue","Yongqi Zhang","Quanming Yao","Yong Li","Xian Wu","Ziheng Zhang","Zhenxi Lin","Yefeng Zheng"],"pdf_url":"https://arxiv.org/pdf/2310.08917v1.pdf","comment":"This short paper has been accepted by EMNLP 2023"},{"id":"http://arxiv.org/abs/2310.08908v1","updated":"2023-10-13T07:30:27Z","published":"2023-10-13T07:30:27Z","title":"Human-in-the-loop Machine Translation with Large Language Model","summary":" The large language model (LLM) has garnered significant attention due to its\nin-context learning mechanisms and emergent capabilities. The research\ncommunity has conducted several pilot studies to apply LLMs to machine\ntranslation tasks and evaluate their performance from diverse perspectives.\nHowever, previous research has primarily focused on the LLM itself and has not\nexplored human intervention in the inference process of LLM. The\ncharacteristics of LLM, such as in-context learning and prompt engineering,\nclosely mirror human cognitive abilities in language tasks, offering an\nintuitive solution for human-in-the-loop generation. In this study, we propose\na human-in-the-loop pipeline that guides LLMs to produce customized outputs\nwith revision instructions. The pipeline initiates by prompting the LLM to\nproduce a draft translation, followed by the utilization of automatic retrieval\nor human feedback as supervision signals to enhance the LLM's translation\nthrough in-context learning. 
The human-machine interactions generated in this\npipeline are also stored in an external database to expand the in-context\nretrieval database, enabling us to leverage human supervision in an offline\nsetting. We evaluate the proposed pipeline using GPT-3.5-turbo API on five\ndomain-specific benchmarks for German-English translation. The results\ndemonstrate the effectiveness of the pipeline in tailoring in-domain\ntranslations and improving translation performance compared to direct\ntranslation. Additionally, we discuss the results from the following\nperspectives: 1) the effectiveness of different in-context retrieval methods;\n2) the construction of a retrieval database under low-resource scenarios; 3)\nthe observed domains differences; 4) the quantitative analysis of linguistic\nstatistics; and 5) the qualitative analysis of translation cases. The code and\ndata are available at https://github.com/NLP2CT/HIL-MT/.\n","authors":["Xinyi Yang","Runzhe Zhan","Derek F. Wong","Junchao Wu","Lidia S. Chao"],"pdf_url":"https://arxiv.org/pdf/2310.08908v1.pdf","comment":"Accepted to MT Summit 2023"},{"id":"http://arxiv.org/abs/2310.08903v1","updated":"2023-10-13T07:18:53Z","published":"2023-10-13T07:18:53Z","title":"SeqXGPT: Sentence-Level AI-Generated Text Detection","summary":" Widely applied large language models (LLMs) can generate human-like content,\nraising concerns about the abuse of LLMs. Therefore, it is important to build\nstrong AI-generated text (AIGT) detectors. Current works only consider\ndocument-level AIGT detection, therefore, in this paper, we first introduce a\nsentence-level detection challenge by synthesizing a dataset that contains\ndocuments that are polished with LLMs, that is, the documents contain sentences\nwritten by humans and sentences modified by LLMs. Then we propose\n\\textbf{Seq}uence \\textbf{X} (Check) \\textbf{GPT}, a novel method that utilizes\nlog probability lists from white-box LLMs as features for sentence-level AIGT\ndetection. These features are composed like \\textit{waves} in speech processing\nand cannot be studied by LLMs. Therefore, we build SeqXGPT based on convolution\nand self-attention networks. We test it in both sentence and document-level\ndetection challenges. Experimental results show that previous methods struggle\nin solving sentence-level AIGT detection, while our method not only\nsignificantly surpasses baseline methods in both sentence and document-level\ndetection challenges but also exhibits strong generalization capabilities.\n","authors":["Pengyu Wang","Linyang Li","Ke Ren","Botian Jiang","Dong Zhang","Xipeng Qiu"],"pdf_url":"https://arxiv.org/pdf/2310.08903v1.pdf","comment":"Accepted by EMNLP2023"},{"id":"http://arxiv.org/abs/2310.08901v1","updated":"2023-10-13T07:15:32Z","published":"2023-10-13T07:15:32Z","title":"Welfare Diplomacy: Benchmarking Language Model Cooperation","summary":" The growing capabilities and increasingly widespread deployment of AI systems\nnecessitate robust benchmarks for measuring their cooperative capabilities.\nUnfortunately, most multi-agent benchmarks are either zero-sum or purely\ncooperative, providing limited opportunities for such measurements. We\nintroduce a general-sum variant of the zero-sum board game Diplomacy -- called\nWelfare Diplomacy -- in which players must balance investing in military\nconquest and domestic welfare. We argue that Welfare Diplomacy facilitates both\na clearer assessment of and stronger training incentives for cooperative\ncapabilities. 
Our contributions are: (1) proposing the Welfare Diplomacy rules\nand implementing them via an open-source Diplomacy engine; (2) constructing\nbaseline agents using zero-shot prompted language models; and (3) conducting\nexperiments where we find that baselines using state-of-the-art models attain\nhigh social welfare but are exploitable. Our work aims to promote societal\nsafety by aiding researchers in developing and assessing multi-agent AI\nsystems. Code to evaluate Welfare Diplomacy and reproduce our experiments is\navailable at https://github.com/mukobi/welfare-diplomacy.\n","authors":["Gabriel Mukobi","Hannah Erlebach","Niklas Lauffer","Lewis Hammond","Alan Chan","Jesse Clifton"],"pdf_url":"https://arxiv.org/pdf/2310.08901v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08899v1","updated":"2023-10-13T07:03:39Z","published":"2023-10-13T07:03:39Z","title":"Exploration with Principles for Diverse AI Supervision","summary":" Training large transformers using next-token prediction has given rise to\ngroundbreaking advancements in AI. While this generative AI approach has\nproduced impressive results, it heavily leans on human supervision. Even\nstate-of-the-art AI models like ChatGPT depend on fine-tuning through human\ndemonstrations, demanding extensive human input and domain expertise. This\nstrong reliance on human oversight poses a significant hurdle to the\nadvancement of AI innovation. To address this limitation, we propose a novel\nparadigm termed Exploratory AI (EAI) aimed at autonomously generating\nhigh-quality training data. Drawing inspiration from unsupervised reinforcement\nlearning (RL) pretraining, EAI achieves exploration within the natural language\nspace. We accomplish this by harnessing large language models to assess the\nnovelty of generated content. Our approach employs two key components: an actor\nthat generates novel content following exploration principles and a critic that\nevaluates the generated content, offering critiques to guide the actor.\nEmpirical evaluations demonstrate that EAI significantly boosts model\nperformance on complex reasoning tasks, addressing the limitations of\nhuman-intensive supervision.\n","authors":["Hao Liu","Matei Zaharia","Pieter Abbeel"],"pdf_url":"https://arxiv.org/pdf/2310.08899v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08889v1","updated":"2023-10-13T06:50:15Z","published":"2023-10-13T06:50:15Z","title":"PerturbScore: Connecting Discrete and Continuous Perturbations in NLP","summary":" With the rapid development of neural network applications in NLP, model\nrobustness problem is gaining more attention. Different from computer vision,\nthe discrete nature of texts makes it more challenging to explore robustness in\nNLP. Therefore, in this paper, we aim to connect discrete perturbations with\ncontinuous perturbations, therefore we can use such connections as a bridge to\nhelp understand discrete perturbations in NLP models. Specifically, we first\nexplore how to connect and measure the correlation between discrete\nperturbations and continuous perturbations. Then we design a regression task as\na PerturbScore to learn the correlation automatically. Through experimental\nresults, we find that we can build a connection between discrete and continuous\nperturbations and use the proposed PerturbScore to learn such correlation,\nsurpassing previous methods used in discrete perturbation measuring. 
Further,\nthe proposed PerturbScore can be well generalized to different datasets,\nperturbation methods, indicating that we can use it as a powerful tool to study\nmodel robustness in NLP.\n","authors":["Linyang Li","Ke Ren","Yunfan Shao","Pengyu Wang","Xipeng Qiu"],"pdf_url":"https://arxiv.org/pdf/2310.08889v1.pdf","comment":"Accepted by Findings of EMNLP2023"},{"id":"http://arxiv.org/abs/2310.08885v1","updated":"2023-10-13T06:36:26Z","published":"2023-10-13T06:36:26Z","title":"InstructTODS: Large Language Models for End-to-End Task-Oriented\n Dialogue Systems","summary":" Large language models (LLMs) have been used for diverse tasks in natural\nlanguage processing (NLP), yet remain under-explored for task-oriented dialogue\nsystems (TODS), especially for end-to-end TODS. We present InstructTODS, a\nnovel off-the-shelf framework for zero-shot end-to-end task-oriented dialogue\nsystems that can adapt to diverse domains without fine-tuning. By leveraging\nLLMs, InstructTODS generates a proxy belief state that seamlessly translates\nuser intentions into dynamic queries for efficient interaction with any KB. Our\nextensive experiments demonstrate that InstructTODS achieves comparable\nperformance to fully fine-tuned TODS in guiding dialogues to successful\ncompletion without prior knowledge or task-specific data. Furthermore, a\nrigorous human evaluation of end-to-end TODS shows that InstructTODS produces\ndialogue responses that notably outperform both the gold responses and the\nstate-of-the-art TODS in terms of helpfulness, informativeness, and humanness.\nMoreover, the effectiveness of LLMs in TODS is further supported by our\ncomprehensive evaluations on TODS subtasks: dialogue state tracking, intent\nclassification, and response generation. Code and implementations could be\nfound here https://github.com/WillyHC22/InstructTODS/\n","authors":["Willy Chung","Samuel Cahyawijaya","Bryan Wilie","Holy Lovenia","Pascale Fung"],"pdf_url":"https://arxiv.org/pdf/2310.08885v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08877v1","updated":"2023-10-13T06:03:47Z","published":"2023-10-13T06:03:47Z","title":"Retrieval-Generation Alignment for End-to-End Task-Oriented Dialogue\n System","summary":" Developing an efficient retriever to retrieve knowledge from a large-scale\nknowledge base (KB) is critical for task-oriented dialogue systems to\neffectively handle localized and specialized tasks. However, widely used\ngenerative models such as T5 and ChatGPT often struggle to differentiate subtle\ndifferences among the retrieved KB records when generating responses, resulting\nin suboptimal quality of generated responses. In this paper, we propose the\napplication of maximal marginal likelihood to train a perceptive retriever by\nutilizing signals from response generation for supervision. In addition, our\napproach goes beyond considering solely retrieved entities and incorporates\nvarious meta knowledge to guide the generator, thus improving the utilization\nof knowledge. We evaluate our approach on three task-oriented dialogue datasets\nusing T5 and ChatGPT as the backbone models. The results demonstrate that when\ncombined with meta knowledge, the response generator can effectively leverage\nhigh-quality knowledge records from the retriever and enhance the quality of\ngenerated responses. 
The codes and models of this paper are available at\nhttps://github.com/shenwzh3/MK-TOD.\n","authors":["Weizhou Shen","Yingqi Gao","Canbin Huang","Fanqi Wan","Xiaojun Quan","Wei Bi"],"pdf_url":"https://arxiv.org/pdf/2310.08877v1.pdf","comment":"Accepted to EMNLP 2023 Main Conference"},{"id":"http://arxiv.org/abs/2310.04443v2","updated":"2023-10-13T05:07:49Z","published":"2023-10-02T21:24:26Z","title":"Human Mobility Question Answering (Vision Paper)","summary":" Question answering (QA) systems have attracted much attention from the\nartificial intelligence community as they can learn to answer questions based\non the given knowledge source (e.g., images in visual question answering).\nHowever, the research into question answering systems with human mobility data\nremains unexplored. Mining human mobility data is crucial for various\napplications such as smart city planning, pandemic management, and personalised\nrecommendation system. In this paper, we aim to tackle this gap and introduce a\nnovel task, that is, human mobility question answering (MobQA). The aim of the\ntask is to let the intelligent system learn from mobility data and answer\nrelated questions. This task presents a new paradigm change in mobility\nprediction research and further facilitates the research of human mobility\nrecommendation systems. To better support this novel research topic, this\nvision paper also proposes an initial design of the dataset and a potential\ndeep learning model framework for the introduced MobQA task. We hope that this\npaper will provide novel insights and open new directions in human mobility\nresearch and question answering research.\n","authors":["Hao Xue","Flora D. Salim"],"pdf_url":"https://arxiv.org/pdf/2310.04443v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08860v1","updated":"2023-10-13T05:03:13Z","published":"2023-10-13T05:03:13Z","title":"Guiding AMR Parsing with Reverse Graph Linearization","summary":" Abstract Meaning Representation (AMR) parsing aims to extract an abstract\nsemantic graph from a given sentence. The sequence-to-sequence approaches,\nwhich linearize the semantic graph into a sequence of nodes and edges and\ngenerate the linearized graph directly, have achieved good performance.\nHowever, we observed that these approaches suffer from structure loss\naccumulation during the decoding process, leading to a much lower F1-score for\nnodes and edges decoded later compared to those decoded earlier. To address\nthis issue, we propose a novel Reverse Graph Linearization (RGL) enhanced\nframework. RGL defines both default and reverse linearization orders of an AMR\ngraph, where most structures at the back part of the default order appear at\nthe front part of the reversed order and vice versa. RGL incorporates the\nreversed linearization to the original AMR parser through a two-pass\nself-distillation mechanism, which guides the model when generating the default\nlinearizations. Our analysis shows that our proposed method significantly\nmitigates the problem of structure loss accumulation, outperforming the\npreviously best AMR parsing model by 0.8 and 0.5 Smatch scores on the AMR 2.0\nand AMR 3.0 dataset, respectively. 
The code are available at\nhttps://github.com/pkunlp-icler/AMR_reverse_graph_linearization.\n","authors":["Bofei Gao","Liang Chen","Peiyi Wang","Zhifang Sui","Baobao Chang"],"pdf_url":"https://arxiv.org/pdf/2310.08860v1.pdf","comment":"Findings of EMNLP2023"},{"id":"http://arxiv.org/abs/2306.17439v2","updated":"2023-10-13T04:50:04Z","published":"2023-06-30T07:24:32Z","title":"Provable Robust Watermarking for AI-Generated Text","summary":" We study the problem of watermarking large language models (LLMs) generated\ntext -- one of the most promising approaches for addressing the safety\nchallenges of LLM usage. In this paper, we propose a rigorous theoretical\nframework to quantify the effectiveness and robustness of LLM watermarks. We\npropose a robust and high-quality watermark method, Unigram-Watermark, by\nextending an existing approach with a simplified fixed grouping strategy. We\nprove that our watermark method enjoys guaranteed generation quality,\ncorrectness in watermark detection, and is robust against text editing and\nparaphrasing. Experiments on three varying LLMs and two datasets verify that\nour Unigram-Watermark achieves superior detection accuracy and comparable\ngeneration quality in perplexity, thus promoting the responsible use of LLMs.\nCode is available at https://github.com/XuandongZhao/Unigram-Watermark.\n","authors":["Xuandong Zhao","Prabhanjan Ananth","Lei Li","Yu-Xiang Wang"],"pdf_url":"https://arxiv.org/pdf/2306.17439v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.10687v2","updated":"2023-10-13T03:38:57Z","published":"2023-09-16T00:55:08Z","title":"EchoPrompt: Instructing the Model to Rephrase Queries for Improved\n In-context Learning","summary":" Language models are achieving impressive performance on various tasks by\naggressively adopting inference-time prompting techniques, such as zero-shot\nand few-shot prompting. In this work, we introduce EchoPrompt, a simple yet\neffective approach that prompts the model to rephrase its queries before\nanswering them. EchoPrompt is adapted for both zero-shot and few-shot\nin-context learning with standard and chain-of-thought prompting. Experimental\nresults show that EchoPrompt yields substantial improvements across all these\nsettings for four families of causal language models. These improvements are\nobserved across various numerical reasoning (e.g. GSM8K, SVAMP), reading\ncomprehension (e.g. DROP), and logical reasoning (e.g. Coin Flipping) tasks. On\naverage, EchoPrompt improves the Zero-shot-CoT performance of code-davinci-002\nby 5% in numerical tasks and 13% in reading comprehension tasks. We investigate\nthe factors contributing to EchoPrompt's effectiveness through ablation\nstudies, which reveal that both the original query and the model-generated\nrephrased version are instrumental in its performance gains. Our empirical\nresults indicate that EchoPrompt is an effective technique that enhances\nin-context learning performance. 
We recommend incorporating EchoPrompt into\nvarious baseline prompting strategies to achieve performance boosts.\n","authors":["Rajasekhar Reddy Mekala","Yasaman Razeghi","Sameer Singh"],"pdf_url":"https://arxiv.org/pdf/2309.10687v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08840v1","updated":"2023-10-13T03:38:38Z","published":"2023-10-13T03:38:38Z","title":"Large Language Models as Source Planner for Personalized\n Knowledge-grounded Dialogue","summary":" Open-domain dialogue system usually requires different sources of knowledge\nto generate more informative and evidential responses. However, existing\nknowledge-grounded dialogue systems either focus on a single knowledge source\nor overlook the dependency between multiple sources of knowledge, which may\nresult in generating inconsistent or even paradoxical responses. To incorporate\nmultiple knowledge sources and dependencies between them, we propose SAFARI, a\nnovel framework that leverages the exceptional capabilities of large language\nmodels (LLMs) in planning, understanding, and incorporating under both\nsupervised and unsupervised settings. Specifically, SAFARI decouples the\nknowledge grounding into multiple sources and response generation, which allows\neasy extension to various knowledge sources including the possibility of not\nusing any sources. To study the problem, we construct a personalized\nknowledge-grounded dialogue dataset \\textit{\\textbf{K}nowledge \\textbf{B}ehind\n\\textbf{P}ersona}~(\\textbf{KBP}), which is the first to consider the dependency\nbetween persona and implicit knowledge. Experimental results on the KBP dataset\ndemonstrate that the SAFARI framework can effectively produce\npersona-consistent and knowledge-enhanced responses.\n","authors":["Hongru Wang","Minda Hu","Yang Deng","Rui Wang","Fei Mi","Weichao Wang","Yasheng Wang","Wai-Chung Kwan","Irwin King","Kam-Fai Wong"],"pdf_url":"https://arxiv.org/pdf/2310.08840v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12966v3","updated":"2023-10-13T02:41:28Z","published":"2023-08-24T17:59:17Z","title":"Qwen-VL: A Versatile Vision-Language Model for Understanding,\n Localization, Text Reading, and Beyond","summary":" In this work, we introduce the Qwen-VL series, a set of large-scale\nvision-language models (LVLMs) designed to perceive and understand both texts\nand images. Starting from the Qwen-LM as a foundation, we endow it with visual\ncapacity by the meticulously designed (i) visual receptor, (ii) input-output\ninterface, (iii) 3-stage training pipeline, and (iv) multilingual multimodal\ncleaned corpus. Beyond the conventional image description and\nquestion-answering, we implement the grounding and text-reading ability of\nQwen-VLs by aligning image-caption-box tuples. The resulting models, including\nQwen-VL and Qwen-VL-Chat, set new records for generalist models under similar\nmodel scales on a broad range of visual-centric benchmarks (e.g., image\ncaptioning, question answering, visual grounding) and different settings (e.g.,\nzero-shot, few-shot). Moreover, on real-world dialog benchmarks, our\ninstruction-tuned Qwen-VL-Chat also demonstrates superiority compared to\nexisting vision-language chatbots. 
Code, demo and models are available at\nhttps://github.com/QwenLM/Qwen-VL.\n","authors":["Jinze Bai","Shuai Bai","Shusheng Yang","Shijie Wang","Sinan Tan","Peng Wang","Junyang Lin","Chang Zhou","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.12966v3.pdf","comment":"Code, demo and models are available at\n https://github.com/QwenLM/Qwen-VL"},{"id":"http://arxiv.org/abs/2310.07849v2","updated":"2023-10-13T01:31:59Z","published":"2023-10-11T19:51:13Z","title":"Synthetic Data Generation with Large Language Models for Text\n Classification: Potential and Limitations","summary":" The collection and curation of high-quality training data is crucial for\ndeveloping text classification models with superior performance, but it is\noften associated with significant costs and time investment. Researchers have\nrecently explored using large language models (LLMs) to generate synthetic\ndatasets as an alternative approach. However, the effectiveness of the\nLLM-generated synthetic data in supporting model training is inconsistent\nacross different classification tasks. To better understand factors that\nmoderate the effectiveness of the LLM-generated synthetic data, in this study,\nwe look into how the performance of models trained on these synthetic data may\nvary with the subjectivity of classification. Our results indicate that\nsubjectivity, at both the task level and instance level, is negatively\nassociated with the performance of the model trained on synthetic data. We\nconclude by discussing the implications of our work on the potential and\nlimitations of leveraging LLM for synthetic data generation.\n","authors":["Zhuoyan Li","Hangxiao Zhu","Zhuoran Lu","Ming Yin"],"pdf_url":"https://arxiv.org/pdf/2310.07849v2.pdf","comment":"EMNLP 2023"},{"id":"http://arxiv.org/abs/2310.08475v2","updated":"2023-10-13T01:12:25Z","published":"2023-10-12T16:32:44Z","title":"Can We Edit Multimodal Large Language Models?","summary":" In this paper, we focus on editing Multimodal Large Language Models (MLLMs).\nCompared to editing single-modal LLMs, multimodal model editing is more\nchallenging, which demands a higher level of scrutiny and careful consideration\nin the editing process. To facilitate research in this area, we construct a new\nbenchmark, dubbed MMEdit, for editing multimodal LLMs and establishing a suite\nof innovative metrics for evaluation. We conduct comprehensive experiments\ninvolving various model editing baselines and analyze the impact of editing\ndifferent components for multimodal LLMs. Empirically, we notice that previous\nbaselines can implement editing multimodal LLMs to some extent, but the effect\nis still barely satisfactory, indicating the potential difficulty of this task.\nWe hope that our work can provide the NLP community with insights. Code and\ndataset are available in https://github.com/zjunlp/EasyEdit.\n","authors":["Siyuan Cheng","Bozhong Tian","Qingbin Liu","Xi Chen","Yongheng Wang","Huajun Chen","Ningyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.08475v2.pdf","comment":"EMNLP 2023"},{"id":"http://arxiv.org/abs/2310.08797v1","updated":"2023-10-13T01:00:15Z","published":"2023-10-13T01:00:15Z","title":"A Comparative Analysis of Task-Agnostic Distillation Methods for\n Compressing Transformer Language Models","summary":" Large language models have become a vital component in modern NLP, achieving\nstate of the art performance in a variety of tasks. 
However, they are often\ninefficient for real-world deployment due to their expensive inference costs.\nKnowledge distillation is a promising technique to improve their efficiency\nwhile retaining most of their effectiveness. In this paper, we reproduce,\ncompare and analyze several representative methods for task-agnostic\n(general-purpose) distillation of Transformer language models. Our target of\nstudy includes Output Distribution (OD) transfer, Hidden State (HS) transfer\nwith various layer mapping strategies, and Multi-Head Attention (MHA) transfer\nbased on MiniLMv2. Through our extensive experiments, we study the\neffectiveness of each method for various student architectures in both\nmonolingual (English) and multilingual settings. Overall, we show that MHA\ntransfer based on MiniLMv2 is generally the best option for distillation and\nexplain the potential reasons behind its success. Moreover, we show that HS\ntransfer remains as a competitive baseline, especially under a sophisticated\nlayer mapping strategy, while OD transfer consistently lags behind other\napproaches. Findings from this study helped us deploy efficient yet effective\nstudent models for latency-critical applications.\n","authors":["Takuma Udagawa","Aashka Trivedi","Michele Merler","Bishwaranjan Bhattacharjee"],"pdf_url":"https://arxiv.org/pdf/2310.08797v1.pdf","comment":"Accepted to EMNLP 2023 Industry Track"},{"id":"http://arxiv.org/abs/2310.08796v1","updated":"2023-10-13T00:49:59Z","published":"2023-10-13T00:49:59Z","title":"End-to-end Story Plot Generator","summary":" Story plots, while short, carry most of the essential information of a full\nstory that may contain tens of thousands of words. We study the problem of\nautomatic generation of story plots, which includes story premise, character\ndescriptions, plot outlines, etc. To generate a single engaging plot, existing\nplot generators (e.g., DOC (Yang et al., 2022a)) require hundreds to thousands\nof calls to LLMs (e.g., OpenAI API) in the planning stage of the story plot,\nwhich is costly and takes at least several minutes. Moreover, the hard-wired\nnature of the method makes the pipeline non-differentiable, blocking fast\nspecialization and personalization of the plot generator. In this paper, we\npropose three models, $\\texttt{OpenPlot}$, $\\texttt{E2EPlot}$ and\n$\\texttt{RLPlot}$, to address these challenges. $\\texttt{OpenPlot}$ replaces\nexpensive OpenAI API calls with LLaMA2 (Touvron et al., 2023) calls via careful\nprompt designs, which leads to inexpensive generation of high-quality training\ndatasets of story plots. We then train an end-to-end story plot generator,\n$\\texttt{E2EPlot}$, by supervised fine-tuning (SFT) using approximately 13000\nstory plots generated by $\\texttt{OpenPlot}$. $\\texttt{E2EPlot}$ generates\nstory plots of comparable quality to $\\texttt{OpenPlot}$, and is > 10$\\times$\nfaster (1k tokens in only 30 seconds on average). 
Finally, we obtain\n$\\texttt{RLPlot}$ that is further fine-tuned with RLHF on several different\nreward models for different aspects of story quality, which yields 60.0$\\%$\nwinning rate against $\\texttt{E2EPlot}$ along the aspect of suspense and\nsurprise.\n","authors":["Hanlin Zhu","Andrew Cohen","Danqing Wang","Kevin Yang","Xiaomeng Yang","Jiantao Jiao","Yuandong Tian"],"pdf_url":"https://arxiv.org/pdf/2310.08796v1.pdf","comment":"17 pages"},{"id":"http://arxiv.org/abs/2310.08795v1","updated":"2023-10-13T00:49:09Z","published":"2023-10-13T00:49:09Z","title":"Mitigating Bias for Question Answering Models by Tracking Bias Influence","summary":" Models of various NLP tasks have been shown to exhibit stereotypes, and the\nbias in the question answering (QA) models is especially harmful as the output\nanswers might be directly consumed by the end users. There have been datasets\nto evaluate bias in QA models, while bias mitigation technique for the QA\nmodels is still under-explored. In this work, we propose BMBI, an approach to\nmitigate the bias of multiple-choice QA models. Based on the intuition that a\nmodel would lean to be more biased if it learns from a biased example, we\nmeasure the bias level of a query instance by observing its influence on\nanother instance. If the influenced instance is more biased, we derive that the\nquery instance is biased. We then use the bias level detected as an\noptimization objective to form a multi-task learning setting in addition to the\noriginal QA task. We further introduce a new bias evaluation metric to quantify\nbias in a comprehensive and sensitive way. We show that our method could be\napplied to multiple QA formulations across multiple bias categories. It can\nsignificantly reduce the bias level in all 9 bias categories in the BBQ dataset\nwhile maintaining comparable QA accuracy.\n","authors":["Mingyu Derek Ma","Jiun-Yu Kao","Arpit Gupta","Yu-Hsiang Lin","Wenbo Zhao","Tagyoung Chung","Wei Wang","Kai-Wei Chang","Nanyun Peng"],"pdf_url":"https://arxiv.org/pdf/2310.08795v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08072v2","updated":"2023-10-13T00:40:29Z","published":"2023-10-12T06:46:07Z","title":"Training Generative Question-Answering on Synthetic Data Obtained from\n an Instruct-tuned Model","summary":" This paper presents a simple and cost-effective method for synthesizing data\nto train question-answering systems. For training, fine-tuning GPT models is a\ncommon practice in resource-rich languages like English, however, it becomes\nchallenging for non-English languages due to the scarcity of sufficient\nquestion-answer (QA) pairs. Existing approaches use question and answer\ngenerators trained on human-authored QA pairs, which involves substantial human\nexpenses. In contrast, we use an instruct-tuned model to generate QA pairs in a\nzero-shot or few-shot manner. We conduct experiments to compare various\nstrategies for obtaining QA pairs from the instruct-tuned model. 
The results\ndemonstrate that a model trained on our proposed synthetic data achieves\ncomparable performance to a model trained on manually curated datasets, without\nincurring human costs.\n","authors":["Kosuke Takahashi","Takahiro Omi","Kosuke Arima","Tatsuya Ishigaki"],"pdf_url":"https://arxiv.org/pdf/2310.08072v2.pdf","comment":"PACLIC 2023 short paper, 4 pages (6 pages including references), 4\n figures"},{"id":"http://arxiv.org/abs/2310.08780v1","updated":"2023-10-13T00:03:37Z","published":"2023-10-13T00:03:37Z","title":"\"Im not Racist but...\": Discovering Bias in the Internal Knowledge of\n Large Language Models","summary":" Large language models (LLMs) have garnered significant attention for their\nremarkable performance in a continuously expanding set of natural language\nprocessing tasks. However, these models have been shown to harbor inherent\nsocietal biases, or stereotypes, which can adversely affect their performance\nin their many downstream applications. In this paper, we introduce a novel,\npurely prompt-based approach to uncover hidden stereotypes within any arbitrary\nLLM. Our approach dynamically generates a knowledge representation of internal\nstereotypes, enabling the identification of biases encoded within the LLM's\ninternal knowledge. By illuminating the biases present in LLMs and offering a\nsystematic methodology for their analysis, our work contributes to advancing\ntransparency and promoting fairness in natural language processing systems.\n","authors":["Abel Salinas","Louis Penafiel","Robert McCormack","Fred Morstatter"],"pdf_url":"https://arxiv.org/pdf/2310.08780v1.pdf","comment":"Warning: This paper discusses and contains content that is offensive\n or upsetting"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2310.09291v1","updated":"2023-10-13T17:59:38Z","published":"2023-10-13T17:59:38Z","title":"Vision-by-Language for Training-Free Compositional Image Retrieval","summary":" Given an image and a target modification (e.g an image of the Eiffel tower\nand the text \"without people and at night-time\"), Compositional Image Retrieval\n(CIR) aims to retrieve the relevant target image in a database. While\nsupervised approaches rely on annotating triplets that is costly (i.e. query\nimage, textual modification, and target image), recent research sidesteps this\nneed by using large-scale vision-language models (VLMs), performing Zero-Shot\nCIR (ZS-CIR). However, state-of-the-art approaches in ZS-CIR still require\ntraining task-specific, customized models over large amounts of image-text\npairs. In this work, we propose to tackle CIR in a training-free manner via our\nCompositional Image Retrieval through Vision-by-Language (CIReVL), a simple,\nyet human-understandable and scalable pipeline that effectively recombines\nlarge-scale VLMs with large language models (LLMs). By captioning the reference\nimage using a pre-trained generative VLM and asking a LLM to recompose the\ncaption based on the textual target modification for subsequent retrieval via\ne.g. CLIP, we achieve modular language reasoning. In four ZS-CIR benchmarks, we\nfind competitive, in-part state-of-the-art performance - improving over\nsupervised methods. Moreover, the modularity of CIReVL offers simple\nscalability without re-training, allowing us to both investigate scaling laws\nand bottlenecks for ZS-CIR while easily scaling up to in parts more than double\nof previously reported results. 
Finally, we show that CIReVL makes CIR\nhuman-understandable by composing image and text in a modular fashion in the\nlanguage domain, thereby making it intervenable, allowing to post-hoc re-align\nfailure cases. Code will be released upon acceptance.\n","authors":["Shyamgopal Karthik","Karsten Roth","Massimiliano Mancini","Zeynep Akata"],"pdf_url":"https://arxiv.org/pdf/2310.09291v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09289v1","updated":"2023-10-13T17:59:02Z","published":"2023-10-13T17:59:02Z","title":"An Unbiased Look at Datasets for Visuo-Motor Pre-Training","summary":" Visual representation learning hold great promise for robotics, but is\nseverely hampered by the scarcity and homogeneity of robotics datasets. Recent\nworks address this problem by pre-training visual representations on\nlarge-scale but out-of-domain data (e.g., videos of egocentric interactions)\nand then transferring them to target robotics tasks. While the field is heavily\nfocused on developing better pre-training algorithms, we find that dataset\nchoice is just as important to this paradigm's success. After all, the\nrepresentation can only learn the structures or priors present in the\npre-training dataset. To this end, we flip the focus on algorithms, and instead\nconduct a dataset centric analysis of robotic pre-training. Our findings call\ninto question some common wisdom in the field. We observe that traditional\nvision datasets (like ImageNet, Kinetics and 100 Days of Hands) are\nsurprisingly competitive options for visuo-motor representation learning, and\nthat the pre-training dataset's image distribution matters more than its size.\nFinally, we show that common simulation benchmarks are not a reliable proxy for\nreal world performance and that simple regularization strategies can\ndramatically improve real world policy learning.\nhttps://data4robotics.github.io\n","authors":["Sudeep Dasari","Mohan Kumar Srirama","Unnat Jain","Abhinav Gupta"],"pdf_url":"https://arxiv.org/pdf/2310.09289v1.pdf","comment":"Accepted to CoRL 2023"},{"id":"http://arxiv.org/abs/2310.09285v1","updated":"2023-10-13T17:52:16Z","published":"2023-10-13T17:52:16Z","title":"SAIR: Learning Semantic-aware Implicit Representation","summary":" Implicit representation of an image can map arbitrary coordinates in the\ncontinuous domain to their corresponding color values, presenting a powerful\ncapability for image reconstruction. Nevertheless, existing implicit\nrepresentation approaches only focus on building continuous appearance mapping,\nignoring the continuities of the semantic information across pixels. As a\nresult, they can hardly achieve desired reconstruction results when the\nsemantic information within input images is corrupted, for example, a large\nregion misses. To address the issue, we propose to learn semantic-aware\nimplicit representation (SAIR), that is, we make the implicit representation of\neach pixel rely on both its appearance and semantic information (\\eg, which\nobject does the pixel belong to). To this end, we propose a framework with two\nmodules: (1) building a semantic implicit representation (SIR) for a corrupted\nimage whose large regions miss. Given an arbitrary coordinate in the continuous\ndomain, we can obtain its respective text-aligned embedding indicating the\nobject the pixel belongs. (2) building an appearance implicit representation\n(AIR) based on the SIR. 
Given an arbitrary coordinate in the continuous domain,\nwe can reconstruct its color whether or not the pixel is missed in the input.\nWe validate the novel semantic-aware implicit representation method on the\nimage inpainting task, and the extensive experiments demonstrate that our\nmethod surpasses state-of-the-art approaches by a significant margin.\n","authors":["Canyu Zhang","Xiaoguang Li","Qing Guo","Song Wang"],"pdf_url":"https://arxiv.org/pdf/2310.09285v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.15523v3","updated":"2023-10-13T17:47:41Z","published":"2023-05-24T19:20:59Z","title":"Task-aware Distributed Source Coding under Dynamic Bandwidth","summary":" Efficient compression of correlated data is essential to minimize\ncommunication overload in multi-sensor networks. In such networks, each sensor\nindependently compresses the data and transmits them to a central node due to\nlimited communication bandwidth. A decoder at the central node decompresses and\npasses the data to a pre-trained machine learning-based task to generate the\nfinal output. Thus, it is important to compress the features that are relevant\nto the task. Additionally, the final performance depends heavily on the total\navailable bandwidth. In practice, it is common to encounter varying\navailability in bandwidth, and higher bandwidth results in better performance\nof the task. We design a novel distributed compression framework composed of\nindependent encoders and a joint decoder, which we call neural distributed\nprincipal component analysis (NDPCA). NDPCA flexibly compresses data from\nmultiple sources to any available bandwidth with a single model, reducing\ncomputing and storage overhead. NDPCA achieves this by learning low-rank task\nrepresentations and efficiently distributing bandwidth among sensors, thus\nproviding a graceful trade-off between performance and bandwidth. Experiments\nshow that NDPCA improves the success rate of multi-view robotic arm\nmanipulation by 9% and the accuracy of object detection tasks on satellite\nimagery by 14% compared to an autoencoder with uniform bandwidth allocation.\n","authors":["Po-han Li","Sravan Kumar Ankireddy","Ruihan Zhao","Hossein Nourkhiz Mahjoub","Ehsan Moradi-Pari","Ufuk Topcu","Sandeep Chinchali","Hyeji Kim"],"pdf_url":"https://arxiv.org/pdf/2305.15523v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09276v1","updated":"2023-10-13T17:38:45Z","published":"2023-10-13T17:38:45Z","title":"Transformer-based Multimodal Change Detection with Multitask Consistency\n Constraints","summary":" Change detection plays a fundamental role in Earth observation for analyzing\ntemporal iterations over time. However, recent studies have largely neglected\nthe utilization of multimodal data that presents significant practical and\ntechnical advantages compared to single-modal approaches. This research focuses\non leveraging digital surface model (DSM) data and aerial images captured at\ndifferent times for detecting change beyond 2D. We observe that the current\nchange detection methods struggle with the multitask conflicts between semantic\nand height change detection tasks. To address this challenge, we propose an\nefficient Transformer-based network that learns shared representation between\ncross-dimensional inputs through cross-attention. 
It adopts a consistency\nconstraint to establish the multimodal relationship, which involves obtaining\npseudo change through height change thresholding and minimizing the difference\nbetween semantic and pseudo change within their overlapping regions. A\nDSM-to-image multimodal dataset encompassing three cities in the Netherlands\nwas constructed. It lays a new foundation for beyond-2D change detection from\ncross-dimensional inputs. Compared to five state-of-the-art change detection\nmethods, our model demonstrates consistent multitask superiority in terms of\nsemantic and height change detection. Furthermore, the consistency strategy can\nbe seamlessly adapted to the other methods, yielding promising improvements.\n","authors":["Biyuan Liu","Huaixin Chen","Kun Li","Michael Ying Yang"],"pdf_url":"https://arxiv.org/pdf/2310.09276v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09275v1","updated":"2023-10-13T17:38:41Z","published":"2023-10-13T17:38:41Z","title":"Understanding and Modeling the Effects of Task and Context on Drivers'\n Gaze Allocation","summary":" Understanding what drivers look at is important for many applications,\nincluding driver training, monitoring, and assistance, as well as self-driving.\nTraditionally, factors affecting human visual attention have been divided into\nbottom-up (involuntary attraction to salient regions) and top-down (task- and\ncontext-driven). Although both play a role in drivers' gaze allocation, most of\nthe existing modeling approaches apply techniques developed for bottom-up\nsaliency and do not consider task and context influences explicitly. Likewise,\ncommon driving attention benchmarks lack relevant task and context annotations.\nTherefore, to enable analysis and modeling of these factors for drivers' gaze\nprediction, we propose the following: 1) address some shortcomings of the\npopular DR(eye)VE dataset and extend it with per-frame annotations for driving\ntask and context; 2) benchmark a number of baseline and SOTA models for\nsaliency and driver gaze prediction and analyze them w.r.t. the new\nannotations; and finally, 3) a novel model that modulates drivers' gaze\nprediction with explicit action and context information, and as a result\nsignificantly improves SOTA performance on DR(eye)VE overall (by 24\\% KLD and\n89\\% NSS) and on a subset of action and safety-critical intersection scenarios\n(by 10--30\\% KLD). Extended annotations, code for model and evaluation will be\nmade publicly available.\n","authors":["Iuliia Kotseruba","John K. Tsotsos"],"pdf_url":"https://arxiv.org/pdf/2310.09275v1.pdf","comment":"12 pages, 8 figures, 8 tables"},{"id":"http://arxiv.org/abs/2310.09247v1","updated":"2023-10-13T16:53:25Z","published":"2023-10-13T16:53:25Z","title":"Hypernymy Understanding Evaluation of Text-to-Image Models via WordNet\n Hierarchy","summary":" Text-to-image synthesis has recently attracted widespread attention due to\nrapidly improving quality and numerous practical applications. However, the\nlanguage understanding capabilities of text-to-image models are still poorly\nunderstood, which makes it difficult to reason about prompt formulations that a\ngiven model would understand well. In this work, we measure the capability of\npopular text-to-image models to understand $\\textit{hypernymy}$, or the \"is-a\"\nrelation between words. We design two automatic metrics based on the WordNet\nsemantic hierarchy and existing image classifiers pretrained on ImageNet. 
These\nmetrics both enable broad quantitative comparison of linguistic capabilities\nfor text-to-image models and offer a way of finding fine-grained qualitative\ndifferences, such as words that are unknown to models and thus are difficult\nfor them to draw. We comprehensively evaluate popular text-to-image models,\nincluding GLIDE, Latent Diffusion, and Stable Diffusion, showing how our\nmetrics can provide a better understanding of the individual strengths and\nweaknesses of these models.\n","authors":["Anton Baryshnikov","Max Ryabinin"],"pdf_url":"https://arxiv.org/pdf/2310.09247v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09236v1","updated":"2023-10-13T16:40:29Z","published":"2023-10-13T16:40:29Z","title":"Time CNN and Graph Convolution Network for Epileptic Spike Detection in\n MEG Data","summary":" Magnetoencephalography (MEG) recordings of patients with epilepsy exhibit\nspikes, a typical biomarker of the pathology. Detecting those spikes allows\naccurate localization of brain regions triggering seizures. Spike detection is\noften performed manually. However, it is a burdensome and error prone task due\nto the complexity of MEG data. To address this problem, we propose a 1D\ntemporal convolutional neural network (Time CNN) coupled with a graph\nconvolutional network (GCN) to classify short time frames of MEG recording as\ncontaining a spike or not. Compared to other recent approaches, our models have\nfewer parameters to train and we propose to use a GCN to account for MEG\nsensors spatial relationships. Our models produce clinically relevant results\nand outperform deep learning-based state-of-the-art methods reaching a\nclassification f1-score of 76.7% on a balanced dataset and of 25.5% on a\nrealistic, highly imbalanced dataset, for the spike class.\n","authors":["Pauline Mouches","Thibaut Dejean","Julien Jung","Romain Bouet","Carole Lartizien","Romain Quentin"],"pdf_url":"https://arxiv.org/pdf/2310.09236v1.pdf","comment":"This work has been submitted to IEEE ISBI 2024 for possible\n publication"},{"id":"http://arxiv.org/abs/2310.09221v1","updated":"2023-10-13T16:18:48Z","published":"2023-10-13T16:18:48Z","title":"Ultrasound Image Segmentation of Thyroid Nodule via Latent Semantic\n Feature Co-Registration","summary":" Segmentation of nodules in thyroid ultrasound imaging plays a crucial role in\nthe detection and treatment of thyroid cancer. However, owing to the diversity\nof scanner vendors and imaging protocols in different hospitals, the automatic\nsegmentation model, which has already demonstrated expert-level accuracy in the\nfield of medical image segmentation, finds its accuracy reduced as the result\nof its weak generalization performance when being applied in clinically\nrealistic environments. To address this issue, the present paper proposes ASTN,\na framework for thyroid nodule segmentation achieved through a new type\nco-registration network. By extracting latent semantic information from the\natlas and target images and utilizing in-depth features to accomplish the\nco-registration of nodules in thyroid ultrasound images, this framework can\nensure the integrity of anatomical structure and reduce the impact on\nsegmentation as the result of overall differences in image caused by different\ndevices. In addition, this paper also provides an atlas selection algorithm to\nmitigate the difficulty of co-registration. 
As shown by the evaluation results\ncollected from the datasets of different devices, thanks to the method we\nproposed, the model generalization has been greatly improved while maintaining\na high level of segmentation accuracy.\n","authors":["Xuewei Li","Yaqiao Zhu","Jie Gao","Xi Wei","Ruixuan Zhang","Yuan Tian","Mei Yu"],"pdf_url":"https://arxiv.org/pdf/2310.09221v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.03026v2","updated":"2023-10-13T16:13:43Z","published":"2023-10-04T17:59:49Z","title":"LanguageMPC: Large Language Models as Decision Makers for Autonomous\n Driving","summary":" Existing learning-based autonomous driving (AD) systems face challenges in\ncomprehending high-level information, generalizing to rare events, and\nproviding interpretability. To address these problems, this work employs Large\nLanguage Models (LLMs) as a decision-making component for complex AD scenarios\nthat require human commonsense understanding. We devise cognitive pathways to\nenable comprehensive reasoning with LLMs, and develop algorithms for\ntranslating LLM decisions into actionable driving commands. Through this\napproach, LLM decisions are seamlessly integrated with low-level controllers by\nguided parameter matrix adaptation. Extensive experiments demonstrate that our\nproposed method not only consistently surpasses baseline approaches in\nsingle-vehicle tasks, but also helps handle complex driving behaviors even\nmulti-vehicle coordination, thanks to the commonsense reasoning capabilities of\nLLMs. This paper presents an initial step toward leveraging LLMs as effective\ndecision-makers for intricate AD scenarios in terms of safety, efficiency,\ngeneralizability, and interoperability. We aspire for it to serve as\ninspiration for future research in this field. Project page:\nhttps://sites.google.com/view/llm-mpc\n","authors":["Hao Sha","Yao Mu","Yuxuan Jiang","Li Chen","Chenfeng Xu","Ping Luo","Shengbo Eben Li","Masayoshi Tomizuka","Wei Zhan","Mingyu Ding"],"pdf_url":"https://arxiv.org/pdf/2310.03026v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.09109v2","updated":"2023-10-13T16:12:32Z","published":"2023-06-15T13:11:30Z","title":"NAVI: Category-Agnostic Image Collections with High-Quality 3D Shape and\n Pose Annotations","summary":" Recent advances in neural reconstruction enable high-quality 3D object\nreconstruction from casually captured image collections. Current techniques\nmostly analyze their progress on relatively simple image collections where\nStructure-from-Motion (SfM) techniques can provide ground-truth (GT) camera\nposes. We note that SfM techniques tend to fail on in-the-wild image\ncollections such as image search results with varying backgrounds and\nilluminations. To enable systematic research progress on 3D reconstruction from\ncasual image captures, we propose NAVI: a new dataset of category-agnostic\nimage collections of objects with high-quality 3D scans along with per-image\n2D-3D alignments providing near-perfect GT camera parameters. These 2D-3D\nalignments allow us to extract accurate derivative annotations such as dense\npixel correspondences, depth and segmentation maps. We demonstrate the use of\nNAVI image collections on different problem settings and show that NAVI enables\nmore thorough evaluations that were not possible with existing datasets. We\nbelieve NAVI is beneficial for systematic research progress on 3D\nreconstruction and correspondence estimation. 
Project page:\nhttps://navidataset.github.io\n","authors":["Varun Jampani","Kevis-Kokitsi Maninis","Andreas Engelhardt","Arjun Karpur","Karen Truong","Kyle Sargent","Stefan Popov","André Araujo","Ricardo Martin-Brualla","Kaushal Patel","Daniel Vlasic","Vittorio Ferrari","Ameesh Makadia","Ce Liu","Yuanzhen Li","Howard Zhou"],"pdf_url":"https://arxiv.org/pdf/2306.09109v2.pdf","comment":"NeurIPS 2023 camera ready. Project page:\n https://navidataset.github.io"},{"id":"http://arxiv.org/abs/2310.09213v1","updated":"2023-10-13T16:07:31Z","published":"2023-10-13T16:07:31Z","title":"Unseen Image Synthesis with Diffusion Models","summary":" While the current trend in the generative field is scaling up towards larger\nmodels and more training data for generalized domain representations, we go the\nopposite direction in this work by synthesizing unseen domain images without\nadditional training. We do so via latent sampling and geometric optimization\nusing pre-trained and frozen Denoising Diffusion Probabilistic Models (DDPMs)\non single-domain datasets. Our key observation is that DDPMs pre-trained even\njust on single-domain images are already equipped with sufficient\nrepresentation abilities to reconstruct arbitrary images from the inverted\nlatent encoding following bi-directional deterministic diffusion and denoising\ntrajectories. This motivates us to investigate the statistical and geometric\nbehaviors of the Out-Of-Distribution (OOD) samples from unseen image domains in\nthe latent spaces along the denoising chain. Notably, we theoretically and\nempirically show that the inverted OOD samples also establish Gaussians that\nare distinguishable from the original In-Domain (ID) samples in the\nintermediate latent spaces, which allows us to sample from them directly.\nGeometrical domain-specific and model-dependent information of the unseen\nsubspace (e.g., sample-wise distance and angles) is used to further optimize\nthe sampled OOD latent encodings from the estimated Gaussian prior. We conduct\nextensive analysis and experiments using pre-trained diffusion models (DDPM,\niDDPM) on different datasets (AFHQ, CelebA-HQ, LSUN-Church, and LSUN-Bedroom),\nproving the effectiveness of this novel perspective to explore and re-think the\ndiffusion models' data synthesis generalization ability.\n","authors":["Ye Zhu","Yu Wu","Zhiwei Deng","Olga Russakovsky","Yan Yan"],"pdf_url":"https://arxiv.org/pdf/2310.09213v1.pdf","comment":"28 pages including appendices"},{"id":"http://arxiv.org/abs/2303.10959v2","updated":"2023-10-13T15:56:51Z","published":"2023-03-20T09:33:05Z","title":"Constructing Metric-Semantic Maps using Floor Plan Priors for Long-Term\n Indoor Localization","summary":" Object-based maps are relevant for scene understanding since they integrate\ngeometric and semantic information of the environment, allowing autonomous\nrobots to robustly localize and interact with on objects. In this paper, we\naddress the task of constructing a metric-semantic map for the purpose of\nlong-term object-based localization. We exploit 3D object detections from\nmonocular RGB frames for both, the object-based map construction, and for\nglobally localizing in the constructed map. To tailor the approach to a target\nenvironment, we propose an efficient way of generating 3D annotations to\nfinetune the 3D object detection model. We evaluate our map construction in an\noffice building, and test our long-term localization approach on challenging\nsequences recorded in the same environment over nine months. 
The experiments\nsuggest that our approach is suitable for constructing metric-semantic maps,\nand that our localization approach is robust to long-term changes. Both, the\nmapping algorithm and the localization pipeline can run online on an onboard\ncomputer. We release an open-source C++/ROS implementation of our approach.\n","authors":["Nicky Zimmerman","Matteo Sodano","Elias Marks","Jens Behley","Cyrill Stachniss"],"pdf_url":"https://arxiv.org/pdf/2303.10959v2.pdf","comment":"7 pages, accepted to IROS 2023"},{"id":"http://arxiv.org/abs/2310.09199v1","updated":"2023-10-13T15:45:19Z","published":"2023-10-13T15:45:19Z","title":"PaLI-3 Vision Language Models: Smaller, Faster, Stronger","summary":" This paper presents PaLI-3, a smaller, faster, and stronger vision language\nmodel (VLM) that compares favorably to similar models that are 10x larger. As\npart of arriving at this strong performance, we compare Vision Transformer\n(ViT) models pretrained using classification objectives to contrastively\n(SigLIP) pretrained ones. We find that, while slightly underperforming on\nstandard image classification benchmarks, SigLIP-based PaLI shows superior\nperformance across various multimodal benchmarks, especially on localization\nand visually-situated text understanding. We scale the SigLIP image encoder up\nto 2 billion parameters, and achieves a new state-of-the-art on multilingual\ncross-modal retrieval. We hope that PaLI-3, at only 5B parameters, rekindles\nresearch on fundamental pieces of complex VLMs, and could fuel a new generation\nof scaled-up models.\n","authors":["Xi Chen","Xiao Wang","Lucas Beyer","Alexander Kolesnikov","Jialin Wu","Paul Voigtlaender","Basil Mustafa","Sebastian Goodman","Ibrahim Alabdulmohsin","Piotr Padlewski","Daniel Salz","Xi Xiong","Daniel Vlasic","Filip Pavetic","Keran Rong","Tianli Yu","Daniel Keysers","Xiaohua Zhai","Radu Soricut"],"pdf_url":"https://arxiv.org/pdf/2310.09199v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.16670v2","updated":"2023-10-13T15:45:13Z","published":"2023-09-28T17:59:51Z","title":"Decaf: Monocular Deformation Capture for Face and Hand Interactions","summary":" Existing methods for 3D tracking from monocular RGB videos predominantly\nconsider articulated and rigid objects. Modelling dense non-rigid object\ndeformations in this setting remained largely unaddressed so far, although such\neffects can improve the realism of the downstream applications such as AR/VR\nand avatar communications. This is due to the severe ill-posedness of the\nmonocular view setting and the associated challenges. While it is possible to\nnaively track multiple non-rigid objects independently using 3D templates or\nparametric 3D models, such an approach would suffer from multiple artefacts in\nthe resulting 3D estimates such as depth ambiguity, unnatural intra-object\ncollisions and missing or implausible deformations. Hence, this paper\nintroduces the first method that addresses the fundamental challenges depicted\nabove and that allows tracking human hands interacting with human faces in 3D\nfrom single monocular RGB videos. We model hands as articulated objects\ninducing non-rigid face deformations during an active interaction. Our method\nrelies on a new hand-face motion and interaction capture dataset with realistic\nface deformations acquired with a markerless multi-view camera system. 
As a\npivotal step in its creation, we process the reconstructed raw 3D shapes with\nposition-based dynamics and an approach for non-uniform stiffness estimation of\nthe head tissues, which results in plausible annotations of the surface\ndeformations, hand-face contact regions and head-hand positions. At the core of\nour neural approach are a variational auto-encoder supplying the hand-face\ndepth prior and modules that guide the 3D tracking by estimating the contacts\nand the deformations. Our final 3D hand and face reconstructions are realistic\nand more plausible compared to several baselines applicable in our setting,\nboth quantitatively and qualitatively.\nhttps://vcai.mpi-inf.mpg.de/projects/Decaf\n","authors":["Soshi Shimada","Vladislav Golyanik","Patrick Pérez","Christian Theobalt"],"pdf_url":"https://arxiv.org/pdf/2309.16670v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.17216v3","updated":"2023-10-13T15:35:42Z","published":"2023-05-26T19:22:03Z","title":"Generating Images with Multimodal Language Models","summary":" We propose a method to fuse frozen text-only large language models (LLMs)\nwith pre-trained image encoder and decoder models, by mapping between their\nembedding spaces. Our model demonstrates a wide suite of multimodal\ncapabilities: image retrieval, novel image generation, and multimodal dialogue.\nOurs is the first approach capable of conditioning on arbitrarily interleaved\nimage and text inputs to generate coherent image (and text) outputs. To achieve\nstrong performance on image generation, we propose an efficient mapping network\nto ground the LLM to an off-the-shelf text-to-image generation model. This\nmapping network translates hidden representations of text into the embedding\nspace of the visual models, enabling us to leverage the strong text\nrepresentations of the LLM for visual outputs. Our approach outperforms\nbaseline generation models on tasks with longer and more complex language. In\naddition to novel image generation, our model is also capable of image\nretrieval from a prespecified dataset, and decides whether to retrieve or\ngenerate at inference time. This is done with a learnt decision module which\nconditions on the hidden representations of the LLM. Our model exhibits a wider\nrange of capabilities compared to prior multimodal language models. It can\nprocess image-and-text inputs, and produce retrieved images, generated images,\nand generated text -- outperforming non-LLM based generation models across\nseveral text-to-image tasks that measure context dependence.\n","authors":["Jing Yu Koh","Daniel Fried","Ruslan Salakhutdinov"],"pdf_url":"https://arxiv.org/pdf/2305.17216v3.pdf","comment":"NeurIPS 2023. Project page: http://jykoh.com/gill"},{"id":"http://arxiv.org/abs/2308.02490v2","updated":"2023-10-13T15:16:59Z","published":"2023-08-04T17:59:47Z","title":"MM-Vet: Evaluating Large Multimodal Models for Integrated Capabilities","summary":" We propose MM-Vet, an evaluation benchmark that examines large multimodal\nmodels (LMMs) on complicated multimodal tasks. Recent LMMs have shown various\nintriguing abilities, such as solving math problems written on the blackboard,\nreasoning about events and celebrities in news images, and explaining visual\njokes. Rapid model advancements pose challenges to evaluation benchmark\ndevelopment. 
Problems include: (1) How to systematically structure and evaluate\nthe complicated multimodal tasks; (2) How to design evaluation metrics that\nwork well across question and answer types; and (3) How to give model insights\nbeyond a simple performance ranking. To this end, we present MM-Vet, designed\nbased on the insight that the intriguing ability to solve complicated tasks is\noften achieved by a generalist model being able to integrate different core\nvision-language (VL) capabilities. MM-Vet defines 6 core VL capabilities and\nexamines the 16 integrations of interest derived from the capability\ncombination. For evaluation metrics, we propose an LLM-based evaluator for\nopen-ended outputs. The evaluator enables the evaluation across different\nquestion types and answer styles, resulting in a unified scoring metric. We\nevaluate representative LMMs on MM-Vet, providing insights into the\ncapabilities of different LMM system paradigms and models. Code and data are\navailable at https://github.com/yuweihao/MM-Vet.\n","authors":["Weihao Yu","Zhengyuan Yang","Linjie Li","Jianfeng Wang","Kevin Lin","Zicheng Liu","Xinchao Wang","Lijuan Wang"],"pdf_url":"https://arxiv.org/pdf/2308.02490v2.pdf","comment":"Update results of OpenFlamingo-9B (MPT), LLaMA-Adapter v2-7B, and\n Otter-9B (MPT). Code, data and leaderboard:\n https://github.com/yuweihao/MM-Vet"},{"id":"http://arxiv.org/abs/2310.09170v1","updated":"2023-10-13T15:03:21Z","published":"2023-10-13T15:03:21Z","title":"mnmDTW: An extension to Dynamic Time Warping for Camera-based Movement\n Error Localization","summary":" In this proof of concept, we use Computer Vision (CV) methods to extract pose\ninformation out of exercise videos. We then employ a modified version of\nDynamic Time Warping (DTW) to calculate the deviation from a gold standard\nexecution of the exercise. Specifically, we calculate the distance between each\nbody part individually to get a more precise measure for exercise accuracy. We\ncan show that exercise mistakes are clearly visible, identifiable and\nlocalizable through this metric.\n","authors":["Sebastian Dill","Maurice Rohr"],"pdf_url":"https://arxiv.org/pdf/2310.09170v1.pdf","comment":"Poster Prague 2023 Conference, 4 pages"},{"id":"http://arxiv.org/abs/2310.05768v2","updated":"2023-10-13T15:00:11Z","published":"2023-10-09T14:54:37Z","title":"DANet: Enhancing Small Object Detection through an Efficient Deformable\n Attention Network","summary":" Efficient and accurate detection of small objects in manufacturing settings,\nsuch as defects and cracks, is crucial for ensuring product quality and safety.\nTo address this issue, we proposed a comprehensive strategy by synergizing\nFaster R-CNN with cutting-edge methods. By combining Faster R-CNN with Feature\nPyramid Network, we enable the model to efficiently handle multi-scale features\nintrinsic to manufacturing environments. Additionally, Deformable Net is used\nthat contorts and conforms to the geometric variations of defects, bringing\nprecision in detecting even the minuscule and complex features. Then, we\nincorporated an attention mechanism called Convolutional Block Attention Module\nin each block of our base ResNet50 network to selectively emphasize informative\nfeatures and suppress less useful ones. After that we incorporated RoI Align,\nreplacing RoI Pooling for finer region-of-interest alignment and finally the\nintegration of Focal Loss effectively handles class imbalance, crucial for rare\ndefect occurrences. 
The rigorous evaluation of our model on both the NEU-DET\nand Pascal VOC datasets underscores its robust performance and generalization\ncapabilities. On the NEU-DET dataset, our model exhibited a profound\nunderstanding of steel defects, achieving state-of-the-art accuracy in\nidentifying various defects. Simultaneously, when evaluated on the Pascal VOC\ndataset, our model showcases its ability to detect objects across a wide\nspectrum of categories within complex and small scenes.\n","authors":["Md Sohag Mia","Abdullah Al Bary Voban","Abu Bakor Hayat Arnob","Abdu Naim","Md Kawsar Ahmed","Md Shariful Islam"],"pdf_url":"https://arxiv.org/pdf/2310.05768v2.pdf","comment":"ICCD-23"},{"id":"http://arxiv.org/abs/2306.16585v2","updated":"2023-10-13T14:56:58Z","published":"2023-06-28T22:36:44Z","title":"SeMLaPS: Real-time Semantic Mapping with Latent Prior Networks and\n Quasi-Planar Segmentation","summary":" The availability of real-time semantics greatly improves the core geometric\nfunctionality of SLAM systems, enabling numerous robotic and AR/VR\napplications. We present a new methodology for real-time semantic mapping from\nRGB-D sequences that combines a 2D neural network and a 3D network based on a\nSLAM system with 3D occupancy mapping. When segmenting a new frame we perform\nlatent feature re-projection from previous frames based on differentiable\nrendering. Fusing re-projected feature maps from previous frames with\ncurrent-frame features greatly improves image segmentation quality, compared to\na baseline that processes images independently. For 3D map processing, we\npropose a novel geometric quasi-planar over-segmentation method that groups 3D\nmap elements likely to belong to the same semantic classes, relying on surface\nnormals. We also describe a novel neural network design for lightweight\nsemantic map post-processing. Our system achieves state-of-the-art semantic\nmapping quality within 2D-3D networks-based systems and matches the performance\nof 3D convolutional networks on three real indoor datasets, while working in\nreal-time. Moreover, it shows better cross-sensor generalization abilities\ncompared to 3D CNNs, enabling training and inference with different depth\nsensors. Code and data will be released on project page:\nhttp://jingwenwang95.github.io/SeMLaPS\n","authors":["Jingwen Wang","Juan Tarrio","Lourdes Agapito","Pablo F. Alcantarilla","Alexander Vakhitov"],"pdf_url":"https://arxiv.org/pdf/2306.16585v2.pdf","comment":"RA-L 2023. 8 pages, 7 figures. Project page:\n http://jingwenwang95.github.io/SeMLaPS"},{"id":"http://arxiv.org/abs/2310.09147v1","updated":"2023-10-13T14:39:34Z","published":"2023-10-13T14:39:34Z","title":"Exploring Sparse Spatial Relation in Graph Inference for Text-Based VQA","summary":" Text-based visual question answering (TextVQA) faces the significant\nchallenge of avoiding redundant relational inference. To be specific, a large\nnumber of detected objects and optical character recognition (OCR) tokens\nresult in rich visual relationships. Existing works take all visual\nrelationships into account for answer prediction. 
However, there are three\nobservations: (1) a single subject in the images can be easily detected as\nmultiple objects with distinct bounding boxes (considered repetitive objects).\nThe associations between these repetitive objects are superfluous for answer\nreasoning; (2) two spatially distant OCR tokens detected in the image\nfrequently have weak semantic dependencies for answer reasoning; and (3) the\nco-existence of nearby objects and tokens may be indicative of important visual\ncues for predicting answers. Rather than utilizing all of them for answer\nprediction, we make an effort to identify the most important connections or\neliminate redundant ones. We propose a sparse spatial graph network (SSGN) that\nintroduces a spatially aware relation pruning technique to this task. As\nspatial factors for relation measurement, we employ spatial distance, geometric\ndimension, overlap area, and DIoU for spatially aware pruning. We consider\nthree visual relationships for graph learning: object-object, OCR-OCR tokens,\nand object-OCR token relationships. SSGN is a progressive graph learning\narchitecture that verifies the pivotal relations in the correlated object-token\nsparse graph, and then in the respective object-based sparse graph and\ntoken-based sparse graph. Experiment results on TextVQA and ST-VQA datasets\ndemonstrate that SSGN achieves promising performances. And some visualization\nresults further demonstrate the interpretability of our method.\n","authors":["Sheng Zhou","Dan Guo","Jia Li","Xun Yang","Meng Wang"],"pdf_url":"https://arxiv.org/pdf/2310.09147v1.pdf","comment":"Accepted by TIP 2023"},{"id":"http://arxiv.org/abs/2310.05664v2","updated":"2023-10-13T14:36:36Z","published":"2023-10-09T12:31:30Z","title":"ViTs are Everywhere: A Comprehensive Study Showcasing Vision\n Transformers in Different Domain","summary":" Transformer design is the de facto standard for natural language processing\ntasks. The success of the transformer design in natural language processing has\nlately piqued the interest of researchers in the domain of computer vision.\nWhen compared to Convolutional Neural Networks (CNNs), Vision Transformers\n(ViTs) are becoming more popular and dominant solutions for many vision\nproblems. Transformer-based models outperform other types of networks, such as\nconvolutional and recurrent neural networks, in a range of visual benchmarks.\nWe evaluate various vision transformer models in this work by dividing them\ninto distinct jobs and examining their benefits and drawbacks. ViTs can\novercome several possible difficulties with convolutional neural networks\n(CNNs). The goal of this survey is to show the first use of ViTs in CV. In the\nfirst phase, we categorize various CV applications where ViTs are appropriate.\nImage classification, object identification, image segmentation, video\ntransformer, image denoising, and NAS are all CV applications. Our next step\nwill be to analyze the state-of-the-art in each area and identify the models\nthat are currently available. In addition, we outline numerous open research\ndifficulties as well as prospective research possibilities.\n","authors":["Md Sohag Mia","Abu Bakor Hayat Arnob","Abdu Naim","Abdullah Al Bary Voban","Md Shariful Islam"],"pdf_url":"https://arxiv.org/pdf/2310.05664v2.pdf","comment":"ICCD-2023. 
arXiv admin note: substantial text overlap with\n arXiv:2208.04309 by other authors"},{"id":"http://arxiv.org/abs/2310.09126v1","updated":"2023-10-13T14:14:43Z","published":"2023-10-13T14:14:43Z","title":"Physics-guided Noise Neural Proxy for Low-light Raw Image Denoising","summary":" Low-light raw image denoising plays a crucial role in mobile photography, and\nlearning-based methods have become the mainstream approach. Training the\nlearning-based methods with synthetic data emerges as an efficient and\npractical alternative to paired real data. However, the quality of synthetic\ndata is inherently limited by the low accuracy of the noise model, which\ndecreases the performance of low-light raw image denoising. In this paper, we\ndevelop a novel framework for accurate noise modeling that learns a\nphysics-guided noise neural proxy (PNNP) from dark frames. PNNP integrates\nthree efficient techniques: physics-guided noise decoupling (PND),\nphysics-guided proxy model (PPM), and differentiable distribution-oriented loss\n(DDL). The PND decouples the dark frame into different components and handles\ndifferent levels of noise in a flexible manner, which reduces the complexity of\nthe noise neural proxy. The PPM incorporates physical priors to effectively\nconstrain the generated noise, which promotes the accuracy of the noise neural\nproxy. The DDL provides explicit and reliable supervision for noise modeling,\nwhich promotes the precision of the noise neural proxy. Extensive experiments\non public low-light raw image denoising datasets and real low-light imaging\nscenarios demonstrate the superior performance of our PNNP framework.\n","authors":["Hansen Feng","Lizhi Wang","Yiqi Huang","Yuzhi Wang","Hua Huang"],"pdf_url":"https://arxiv.org/pdf/2310.09126v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09125v1","updated":"2023-10-13T14:14:00Z","published":"2023-10-13T14:14:00Z","title":"Training and Predicting Visual Error for Real-Time Applications","summary":" Visual error metrics play a fundamental role in the quantification of\nperceived image similarity. Most recently, use cases for them in real-time\napplications have emerged, such as content-adaptive shading and shading reuse\nto increase performance and improve efficiency. A wide range of different\nmetrics has been established, with the most sophisticated being capable of\ncapturing the perceptual characteristics of the human visual system. However,\ntheir complexity, computational expense, and reliance on reference images to\ncompare against prevent their generalized use in real-time, restricting such\napplications to using only the simplest available metrics. In this work, we\nexplore the abilities of convolutional neural networks to predict a variety of\nvisual metrics without requiring either reference or rendered images.\nSpecifically, we train and deploy a neural network to estimate the visual error\nresulting from reusing shading or using reduced shading rates. The resulting\nmodels account for 70%-90% of the variance while achieving up to an order of\nmagnitude faster computation times. Our solution combines image-space\ninformation that is readily available in most state-of-the-art deferred shading\npipelines with reprojection from previous frames to enable an adequate estimate\nof visual errors, even in previously unseen regions. We describe a suitable\nconvolutional network architecture and considerations for data preparation for\ntraining. 
We demonstrate the capability of our network to predict complex error\nmetrics at interactive rates in a real-time application that implements\ncontent-adaptive shading in a deferred pipeline. Depending on the portion of\nunseen image regions, our approach can achieve up to $2\\times$ performance\ncompared to state-of-the-art methods.\n","authors":["João Libório Cardoso","Bernhard Kerbl","Lei Yang","Yury Uralsky","Michael Wimmer"],"pdf_url":"https://arxiv.org/pdf/2310.09125v1.pdf","comment":"Published at Proceedings of the ACM in Computer Graphics and\n Interactive Techniques. 14 Pages, 16 Figures, 3 Tables. For paper website and\n higher quality figures, see https://jaliborc.github.io/rt-percept/"},{"id":"http://arxiv.org/abs/2310.09122v1","updated":"2023-10-13T14:11:33Z","published":"2023-10-13T14:11:33Z","title":"Equirectangular image construction method for standard CNNs for Semantic\n Segmentation","summary":" 360{\\deg} spherical images have the advantage of a wide field of view, and are\ntypically projected onto a plane for processing, which is known as an\nequirectangular image. The object shape in equirectangular images can be\ndistorted and lack translation invariance. In addition, there are few publicly\navailable datasets of equirectangular images with labels, which presents a challenge for\nstandard CNN models to process equirectangular images effectively. To tackle\nthis problem, we propose a methodology for converting a perspective image into an\nequirectangular image. The inverse transformations of the spherical center\nprojection and the equidistant cylindrical projection are employed. This\nenables the standard CNNs to learn the distortion features at different\npositions in the equirectangular image and thereby gain the ability to\nsemantically segment the equirectangular image. The parameter, {\\phi}, which determines\nthe projection position of the perspective image, has been analyzed using\nvarious datasets and models, such as UNet, UNet++, SegNet, PSPNet, and DeepLab\nv3+. The experiments demonstrate that an optimal value of {\\phi} for effective\nsemantic segmentation of equirectangular images is 6{\\pi}/16 for standard CNNs.\nCompared with the other three types of methods (supervised learning,\nunsupervised learning and data augmentation), the method proposed in this paper\nhas the best average IoU value of 43.76%. This value is 23.85%, 10.7% and\n17.23% higher than those of the other three methods, respectively.\n","authors":["Haoqian Chen","Jian Liu","Minghe Li","Kaiwen Jiang","Ziheng Xu","Rencheng Sun","Yi Sui"],"pdf_url":"https://arxiv.org/pdf/2310.09122v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.07273v6","updated":"2023-10-13T14:10:29Z","published":"2022-11-14T11:07:18Z","title":"MLIC: Multi-Reference Entropy Model for Learned Image Compression","summary":" Recently, learned image compression has achieved remarkable performance. The\nentropy model, which estimates the distribution of the latent representation,\nplays a crucial role in boosting rate-distortion performance. However, most\nentropy models only capture correlations in one dimension, while the latent\nrepresentation contains channel-wise, local spatial, and global spatial\ncorrelations. To tackle this issue, we propose the Multi-Reference Entropy\nModel (MEM) and the advanced version, MEM$^+$. These models capture the\ndifferent types of correlations present in the latent representation. Specifically,\nwe first divide the latent representation into slices. 
When decoding the\ncurrent slice, we use previously decoded slices as context and employ the\nattention map of the previously decoded slice to predict global correlations in\nthe current slice. To capture local contexts, we introduce two enhanced\ncheckerboard context capturing techniques that avoids performance degradation.\nBased on MEM and MEM$^+$, we propose image compression models MLIC and\nMLIC$^+$. Extensive experimental evaluations demonstrate that our MLIC and\nMLIC$^+$ models achieve state-of-the-art performance, reducing BD-rate by\n$8.05\\%$ and $11.39\\%$ on the Kodak dataset compared to VTM-17.0 when measured\nin PSNR. Our code will be available at https://github.com/JiangWeibeta/MLIC.\n","authors":["Wei Jiang","Jiayu Yang","Yongqi Zhai","Peirong Ning","Feng Gao","Ronggang Wang"],"pdf_url":"https://arxiv.org/pdf/2211.07273v6.pdf","comment":"Accepted at ACMMM 2023"},{"id":"http://arxiv.org/abs/2309.09379v2","updated":"2023-10-13T14:04:54Z","published":"2023-09-17T21:06:22Z","title":"A Critical Analysis of Internal Reliability for Uncertainty\n Quantification of Dense Image Matching in Multi-view Stereo","summary":" Nowadays, photogrammetrically derived point clouds are widely used in many\ncivilian applications due to their low cost and flexibility in acquisition.\nTypically, photogrammetric point clouds are assessed through reference data\nsuch as LiDAR point clouds. However, when reference data are not available, the\nassessment of photogrammetric point clouds may be challenging. Since these\npoint clouds are algorithmically derived, their accuracies and precisions are\nhighly varying with the camera networks, scene complexity, and dense image\nmatching (DIM) algorithms, and there is no standard error metric to determine\nper-point errors. The theory of internal reliability of camera networks has\nbeen well studied through first-order error estimation of Bundle Adjustment\n(BA), which is used to understand the errors of 3D points assuming known\nmeasurement errors. However, the measurement errors of the DIM algorithms are\nintricate to an extent that every single point may have its error function\ndetermined by factors such as pixel intensity, texture entropy, and surface\nsmoothness. Despite the complexity, there exist a few common metrics that may\naid the process of estimating the posterior reliability of the derived points,\nespecially in a multi-view stereo (MVS) setup when redundancies are present. In\nthis paper, by using an aerial oblique photogrammetric block with LiDAR\nreference data, we analyze several internal matching metrics within a common\nMVS framework, including statistics in ray convergence, intersection angles,\nDIM energy, etc.\n","authors":["Debao Huang","Rongjun Qin"],"pdf_url":"https://arxiv.org/pdf/2309.09379v2.pdf","comment":"Figure 8"},{"id":"http://arxiv.org/abs/2310.09118v1","updated":"2023-10-13T14:03:01Z","published":"2023-10-13T14:03:01Z","title":"DSG: An End-to-End Document Structure Generator","summary":" Information in industry, research, and the public sector is widely stored as\nrendered documents (e.g., PDF files, scans). Hence, to enable downstream tasks,\nsystems are needed that map rendered documents onto a structured hierarchical\nformat. However, existing systems for this task are limited by heuristics and\nare not end-to-end trainable. In this work, we introduce the Document Structure\nGenerator (DSG), a novel system for document parsing that is fully end-to-end\ntrainable. 
DSG combines a deep neural network for parsing (i) entities in\ndocuments (e.g., figures, text blocks, headers, etc.) and (ii) relations that\ncapture the sequence and nested structure between entities. Unlike existing\nsystems that rely on heuristics, our DSG is trained end-to-end, making it\neffective and flexible for real-world applications. We further contribute a\nnew, large-scale dataset called E-Periodica comprising real-world magazines\nwith complex document structures for evaluation. Our results demonstrate that\nour DSG outperforms commercial OCR tools and, on top of that, achieves\nstate-of-the-art performance. To the best of our knowledge, our DSG system is\nthe first end-to-end trainable system for hierarchical document parsing.\n","authors":["Johannes Rausch","Gentiana Rashiti","Maxim Gusev","Ce Zhang","Stefan Feuerriegel"],"pdf_url":"https://arxiv.org/pdf/2310.09118v1.pdf","comment":"Accepted at ICDM 2023"},{"id":"http://arxiv.org/abs/2310.09114v1","updated":"2023-10-13T14:00:49Z","published":"2023-10-13T14:00:49Z","title":"Timestamp-supervised Wearable-based Activity Segmentation and\n Recognition with Contrastive Learning and Order-Preserving Optimal Transport","summary":" Human activity recognition (HAR) with wearables is one of the serviceable\ntechnologies in ubiquitous and mobile computing applications. The\nsliding-window scheme is widely adopted while suffering from the multi-class\nwindows problem. As a result, there is a growing focus on joint segmentation\nand recognition with deep-learning methods, aiming at simultaneously dealing\nwith HAR and time-series segmentation issues. However, obtaining the full\nactivity annotations of wearable data sequences is resource-intensive or\ntime-consuming, while unsupervised methods yield poor performance. To address\nthese challenges, we propose a novel method for joint activity segmentation and\nrecognition with timestamp supervision, in which only a single annotated sample\nis needed in each activity segment. However, the limited information of sparse\nannotations exacerbates the gap between recognition and segmentation tasks,\nleading to sub-optimal model performance. Therefore, the prototypes are\nestimated by class-activation maps to form a sample-to-prototype contrast\nmodule for well-structured embeddings. Moreover, with the optimal transport\ntheory, our approach generates the sample-level pseudo-labels that take\nadvantage of unlabeled data between timestamp annotations for further\nperformance improvement. Comprehensive experiments on four public HAR datasets\ndemonstrate that our model trained with timestamp supervision is superior to\nthe state-of-the-art weakly-supervised methods and achieves comparable\nperformance to the fully-supervised approaches.\n","authors":["Songpengcheng Xia","Lei Chu","Ling Pei","Jiarui Yang","Wenxian Yu","Robert C. Qiu"],"pdf_url":"https://arxiv.org/pdf/2310.09114v1.pdf","comment":"Under Review (submitted to IEEE TMC)"},{"id":"http://arxiv.org/abs/2211.08007v2","updated":"2023-10-13T14:00:48Z","published":"2022-11-15T09:42:07Z","title":"Uncertainty-aware Gait Recognition via Learning from Dirichlet\n Distribution-based Evidence","summary":" Existing gait recognition frameworks retrieve an identity in the gallery\nbased on the distance between a probe sample and the identities in the gallery.\nHowever, existing methods often neglect that the gallery may not contain\nidentities corresponding to the probes, leading to recognition errors rather\nthan raising an alarm. 
In this paper, we introduce a novel uncertainty-aware\ngait recognition method that models the uncertainty of identification based on\nlearned evidence. Specifically, we treat our recognition model as an evidence\ncollector to gather evidence from input samples and parameterize a Dirichlet\ndistribution over the evidence. The Dirichlet distribution essentially\nrepresents the density of the probability assigned to the input samples. We\nutilize the distribution to evaluate the resultant uncertainty of each probe\nsample and then determine whether a probe has a counterpart in the gallery or\nnot. To the best of our knowledge, our method is the first attempt to tackle\ngait recognition with uncertainty modelling. Moreover, our uncertainty modeling\nsignificantly improves the robustness against out-of-distribution (OOD)\nqueries. Extensive experiments demonstrate that our method achieves\nstate-of-the-art performance on datasets with OOD queries, and can also\ngeneralize well to other identity-retrieval tasks. Importantly, our method\noutperforms the state-of-the-art by a large margin of 51.26% when the OOD query\nrate is around 50% on OUMVLP.\n","authors":["Beibei Lin","Chen Liu","Ming Wang","Lincheng Li","Shunli Zhang","Robby T. Tan","Xin Yu"],"pdf_url":"https://arxiv.org/pdf/2211.08007v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.09675v3","updated":"2023-10-13T13:48:58Z","published":"2023-06-16T08:13:41Z","title":"Multi-View Class Incremental Learning","summary":" Multi-view learning (MVL) has gained great success in integrating information\nfrom multiple perspectives of a dataset to improve downstream task performance.\nTo make MVL methods more practical in an open-ended environment, this paper\ninvestigates a novel paradigm called multi-view class incremental learning\n(MVCIL), where a single model incrementally classifies new classes from a\ncontinual stream of views, requiring no access to earlier views of data.\nHowever, MVCIL is challenged by the catastrophic forgetting of old information\nand the interference with learning new concepts. To address this, we first\ndevelop a randomization-based representation learning technique for\nfeature extraction to guarantee their separate view-optimal working states,\nduring which multiple views belonging to a class are presented sequentially;\nthen, we integrate them one by one in the orthogonality fusion subspace spanned\nby the extracted features; finally, we introduce selective weight consolidation\nfor learning-without-forgetting decision-making while encountering new classes.\nExtensive experiments on synthetic and real-world datasets validate the\neffectiveness of our approach.\n","authors":["Depeng Li","Tianqi Wang","Junwei Chen","Kenji Kawaguchi","Cheng Lian","Zhigang Zeng"],"pdf_url":"https://arxiv.org/pdf/2306.09675v3.pdf","comment":"Accepted to Information Fusion"},{"id":"http://arxiv.org/abs/2310.02071v2","updated":"2023-10-13T13:43:53Z","published":"2023-10-03T14:13:36Z","title":"Towards End-to-End Embodied Decision Making via Multi-modal Large\n Language Model: Explorations with GPT4-Vision and Beyond","summary":" In this study, we explore the potential of Multimodal Large Language Models\n(MLLMs) in improving embodied decision-making processes for agents. While Large\nLanguage Models (LLMs) have been widely used due to their advanced reasoning\nskills and vast world knowledge, MLLMs like GPT4-Vision offer enhanced visual\nunderstanding and reasoning capabilities. 
We investigate whether\nstate-of-the-art MLLMs can handle embodied decision-making in an end-to-end\nmanner and whether collaborations between LLMs and MLLMs can enhance\ndecision-making. To address these questions, we introduce a new benchmark\ncalled PCA-EVAL, which evaluates embodied decision-making from the perspectives\nof Perception, Cognition, and Action. Additionally, we propose HOLMES, a\nmulti-agent cooperation framework that allows LLMs to leverage MLLMs and APIs\nto gather multimodal information for informed decision-making. We compare\nend-to-end embodied decision-making and HOLMES on our benchmark and find that\nthe GPT4-Vision model demonstrates strong end-to-end embodied decision-making\nabilities, outperforming GPT4-HOLMES in terms of average decision accuracy\n(+3%). However, this performance is exclusive to the latest GPT4-Vision model,\nsurpassing the open-source state-of-the-art MLLM by 26%. Our results indicate\nthat powerful MLLMs like GPT4-Vision hold promise for decision-making in\nembodied agents, offering new avenues for MLLM research. Code and data are open\nat https://github.com/pkunlp-icler/PCA-EVAL/.\n","authors":["Liang Chen","Yichi Zhang","Shuhuai Ren","Haozhe Zhao","Zefan Cai","Yuchi Wang","Peiyi Wang","Tianyu Liu","Baobao Chang"],"pdf_url":"https://arxiv.org/pdf/2310.02071v2.pdf","comment":"18 pages, 10 figures, Code and data:\n https://github.com/pkunlp-icler/PCA-EVAL/"},{"id":"http://arxiv.org/abs/2304.03550v2","updated":"2023-10-13T13:38:26Z","published":"2023-04-07T09:11:29Z","title":"Hierarchical Disentanglement-Alignment Network for Robust SAR Vehicle\n Recognition","summary":" Vehicle recognition is a fundamental problem in SAR image interpretation.\nHowever, robustly recognizing vehicle targets is a challenging task in SAR due\nto the large intraclass variations and small interclass variations.\nAdditionally, the lack of large datasets further complicates the task. Inspired\nby the analysis of target signature variations and deep learning\nexplainability, this paper proposes a novel domain alignment framework named\nthe Hierarchical Disentanglement-Alignment Network (HDANet) to achieve\nrobustness under various operating conditions. Concisely, HDANet integrates\nfeature disentanglement and alignment into a unified framework with three\nmodules: domain data generation, multitask-assisted mask disentanglement, and\ndomain alignment of target features. The first module generates diverse data\nfor alignment, and three simple but effective data augmentation methods are\ndesigned to simulate target signature variations. The second module\ndisentangles the target features from background clutter using the\nmultitask-assisted mask to prevent clutter from interfering with subsequent\nalignment. The third module employs a contrastive loss for domain alignment to\nextract robust target features from generated diverse data and disentangled\nfeatures. 
Lastly, the proposed method demonstrates impressive robustness across\nnine operating conditions in the MSTAR dataset, and extensive qualitative and\nquantitative analyses validate the effectiveness of our framework.\n","authors":["Weijie Li","Wei Yang","Wenpeng Zhang","Tianpeng Liu","Yongxiang Liu","Li Liu"],"pdf_url":"https://arxiv.org/pdf/2304.03550v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08586v2","updated":"2023-10-13T13:37:57Z","published":"2023-10-12T17:59:57Z","title":"PonderV2: Pave the Way for 3D Foundation Model with A Universal\n Pre-training Paradigm","summary":" In contrast to numerous NLP and 2D computer vision foundational models, the\nlearning of a robust and highly generalized 3D foundational model poses\nconsiderably greater challenges. This is primarily due to the inherent data\nvariability and the diversity of downstream tasks. In this paper, we introduce\na comprehensive 3D pre-training framework designed to facilitate the\nacquisition of efficient 3D representations, thereby establishing a pathway to\n3D foundational models. Motivated by the fact that informative 3D features\nshould be able to encode rich geometry and appearance cues that can be utilized\nto render realistic images, we propose a novel universal paradigm to learn\npoint cloud representations by differentiable neural rendering, serving as a\nbridge between 3D and 2D worlds. We train a point cloud encoder within a\ndevised volumetric neural renderer by comparing the rendered images with the\nreal images. Notably, our approach demonstrates the seamless integration of the\nlearned 3D encoder into diverse downstream tasks. These tasks encompass not\nonly high-level challenges such as 3D detection and segmentation but also\nlow-level objectives like 3D reconstruction and image synthesis, spanning both\nindoor and outdoor scenarios. Besides, we also illustrate the capability of\npre-training a 2D backbone using the proposed universal methodology, surpassing\nconventional pre-training methods by a large margin. For the first time,\nPonderV2 achieves state-of-the-art performance on 11 indoor and outdoor\nbenchmarks. The consistent improvements in various settings imply the\neffectiveness of the proposed method. Code and models will be made available at\nhttps://github.com/OpenGVLab/PonderV2.\n","authors":["Haoyi Zhu","Honghui Yang","Xiaoyang Wu","Di Huang","Sha Zhang","Xianglong He","Tong He","Hengshuang Zhao","Chunhua Shen","Yu Qiao","Wanli Ouyang"],"pdf_url":"https://arxiv.org/pdf/2310.08586v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2301.00157"},{"id":"http://arxiv.org/abs/2310.09099v1","updated":"2023-10-13T13:35:19Z","published":"2023-10-13T13:35:19Z","title":"Faster 3D cardiac CT segmentation with Vision Transformers","summary":" Accurate segmentation of the heart is essential for personalized blood flow\nsimulations and surgical intervention planning. A recent advancement in image\nrecognition is the Vision Transformer (ViT), which expands the field of view to\nencompass a greater portion of the global image context. We adapted ViT for\nthree-dimensional volume inputs. Cardiac computed tomography (CT) volumes from\n39 patients, featuring up to 20 timepoints representing the complete cardiac\ncycle, were utilized. 
Our network incorporates a modified ResNet50 block as\nwell as a ViT block and employs cascade upsampling with skip connections.\nDespite its increased model complexity, our hybrid Transformer-Residual U-Net\nframework, termed TRUNet, converges in significantly less time than residual\nU-Net while providing comparable or superior segmentations of the left\nventricle, left atrium, left atrial appendage, ascending aorta, and pulmonary\nveins. TRUNet offers more precise vessel boundary segmentation and better\ncaptures the heart's overall anatomical structure compared to residual U-Net,\nas confirmed by the absence of extraneous clusters of missegmented voxels. In\nterms of both performance and training speed, TRUNet exceeded U-Net, a commonly\nused segmentation architecture, making it a promising tool for 3D semantic\nsegmentation tasks in medical imaging. The code for TRUNet is available at\ngithub.com/ljollans/TRUNet.\n","authors":["Lee Jollans","Mariana Bustamante","Lilian Henriksson","Anders Persson","Tino Ebbers"],"pdf_url":"https://arxiv.org/pdf/2310.09099v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09092v1","updated":"2023-10-13T13:24:37Z","published":"2023-10-13T13:24:37Z","title":"iPUNet:Iterative Cross Field Guided Point Cloud Upsampling","summary":" Point clouds acquired by 3D scanning devices are often sparse, noisy, and\nnon-uniform, causing a loss of geometric features. To facilitate the usability\nof point clouds in downstream applications, given such input, we present a\nlearning-based point upsampling method, i.e., iPUNet, which generates dense and\nuniform points at arbitrary ratios and better captures sharp features. To\ngenerate feature-aware points, we introduce cross fields that are aligned to\nsharp geometric features by self-supervision to guide point generation. Given\ncross field defined frames, we enable arbitrary ratio upsampling by learning at\neach input point a local parameterized surface. The learned surface consumes\nthe neighboring points and 2D tangent plane coordinates as input, and maps onto\na continuous surface in 3D where arbitrary ratios of output points can be\nsampled. To solve the non-uniformity of input points, on top of the cross field\nguided upsampling, we further introduce an iterative strategy that refines the\npoint distribution by moving sparse points onto the desired continuous 3D\nsurface in each iteration. Within only a few iterations, the sparse points are\nevenly distributed and their corresponding dense samples are more uniform and\nbetter capture geometric features. Through extensive evaluations on diverse\nscans of objects and scenes, we demonstrate that iPUNet is robust to handle\nnoisy and non-uniformly distributed inputs, and outperforms state-of-the-art\npoint cloud upsampling methods.\n","authors":["Guangshun Wei","Hao Pan","Shaojie Zhuang","Yuanfeng Zhou","Changjian Li"],"pdf_url":"https://arxiv.org/pdf/2310.09092v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.10474v2","updated":"2023-10-13T13:05:26Z","published":"2023-06-18T04:34:17Z","title":"A Universal Semantic-Geometric Representation for Robotic Manipulation","summary":" Robots rely heavily on sensors, especially RGB and depth cameras, to perceive\nand interact with the world. RGB cameras record 2D images with rich semantic\ninformation while missing precise spatial information. 
On the other side, depth\ncameras offer critical 3D geometry data but capture limited semantics.\nTherefore, integrating both modalities is crucial for learning representations\nfor robotic perception and control. However, current research predominantly\nfocuses on only one of these modalities, neglecting the benefits of\nincorporating both. To this end, we present $\\textbf{Semantic-Geometric\nRepresentation} (\\textbf{SGR})$, a universal perception module for robotics\nthat leverages the rich semantic information of large-scale pre-trained 2D\nmodels and inherits the merits of 3D spatial reasoning. Our experiments\ndemonstrate that SGR empowers the agent to successfully complete a diverse\nrange of simulated and real-world robotic manipulation tasks, outperforming\nstate-of-the-art methods significantly in both single-task and multi-task\nsettings. Furthermore, SGR possesses the capability to generalize to novel\nsemantic attributes, setting it apart from the other methods. Project website:\nhttps://semantic-geometric-representation.github.io.\n","authors":["Tong Zhang","Yingdong Hu","Hanchen Cui","Hang Zhao","Yang Gao"],"pdf_url":"https://arxiv.org/pdf/2306.10474v2.pdf","comment":"CoRL 2023. Project website:\n https://semantic-geometric-representation.github.io"},{"id":"http://arxiv.org/abs/2304.12372v3","updated":"2023-10-13T12:58:41Z","published":"2023-04-24T18:10:25Z","title":"Beyond the Pixel: a Photometrically Calibrated HDR Dataset for Luminance\n and Color Prediction","summary":" Light plays an important role in human well-being. However, most computer\nvision tasks treat pixels without considering their relationship to physical\nluminance. To address this shortcoming, we introduce the Laval Photometric\nIndoor HDR Dataset, the first large-scale photometrically calibrated dataset of\nhigh dynamic range 360{\\deg} panoramas. Our key contribution is the calibration\nof an existing, uncalibrated HDR Dataset. We do so by accurately capturing RAW\nbracketed exposures simultaneously with a professional photometric measurement\ndevice (chroma meter) for multiple scenes across a variety of lighting\nconditions. Using the resulting measurements, we establish the calibration\ncoefficients to be applied to the HDR images. The resulting dataset is a rich\nrepresentation of indoor scenes which displays a wide range of illuminance and\ncolor, and varied types of light sources. We exploit the dataset to introduce\nthree novel tasks, where: per-pixel luminance, per-pixel color and planar\nilluminance can be predicted from a single input image. Finally, we also\ncapture another smaller photometric dataset with a commercial 360{\\deg} camera,\nto experiment on generalization across cameras. We are optimistic that the\nrelease of our datasets and associated code will spark interest in physically\naccurate light estimation within the community. Dataset and code are available\nat https://lvsn.github.io/beyondthepixel/.\n","authors":["Christophe Bolduc","Justine Giroux","Marc Hébert","Claude Demers","Jean-François Lalonde"],"pdf_url":"https://arxiv.org/pdf/2304.12372v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.15833v2","updated":"2023-10-13T12:49:35Z","published":"2023-03-28T09:05:15Z","title":"Complementary Domain Adaptation and Generalization for Unsupervised\n Continual Domain Shift Learning","summary":" Continual domain shift poses a significant challenge in real-world\napplications, particularly in situations where labeled data is not available\nfor new domains. 
The challenge of acquiring knowledge in this problem setting\nis referred to as unsupervised continual domain shift learning. Existing\nmethods for domain adaptation and generalization have limitations in addressing\nthis issue, as they focus either on adapting to a specific domain or\ngeneralizing to unseen domains, but not both. In this paper, we propose\nComplementary Domain Adaptation and Generalization (CoDAG), a simple yet\neffective learning framework that combines domain adaptation and generalization\nin a complementary manner to achieve three major goals of unsupervised\ncontinual domain shift learning: adapting to a current domain, generalizing to\nunseen domains, and preventing forgetting of previously seen domains. Our\napproach is model-agnostic, meaning that it is compatible with any existing\ndomain adaptation and generalization algorithms. We evaluate CoDAG on several\nbenchmark datasets and demonstrate that our model outperforms state-of-the-art\nmodels in all datasets and evaluation metrics, highlighting its effectiveness\nand robustness in handling unsupervised continual domain shift learning.\n","authors":["Wonguk Cho","Jinha Park","Taesup Kim"],"pdf_url":"https://arxiv.org/pdf/2303.15833v2.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2304.06385v4","updated":"2023-10-13T12:43:31Z","published":"2023-04-13T10:37:41Z","title":"TransHP: Image Classification with Hierarchical Prompting","summary":" This paper explores a hierarchical prompting mechanism for the hierarchical\nimage classification (HIC) task. Different from prior HIC methods, our\nhierarchical prompting is the first to explicitly inject ancestor-class\ninformation as a tokenized hint that benefits the descendant-class\ndiscrimination. We think it well imitates human visual recognition, i.e.,\nhumans may use the ancestor class as a prompt to draw focus on the subtle\ndifferences among descendant classes. We model this prompting mechanism into a\nTransformer with Hierarchical Prompting (TransHP). TransHP consists of three\nsteps: 1) learning a set of prompt tokens to represent the coarse (ancestor)\nclasses, 2) on-the-fly predicting the coarse class of the input image at an\nintermediate block, and 3) injecting the prompt token of the predicted coarse\nclass into the intermediate feature. Though the parameters of TransHP maintain\nthe same for all input images, the injected coarse-class prompt conditions\n(modifies) the subsequent feature extraction and encourages a dynamic focus on\nrelatively subtle differences among the descendant classes. Extensive\nexperiments show that TransHP improves image classification on accuracy (e.g.,\nimproving ViT-B/16 by +2.83% ImageNet classification accuracy), training data\nefficiency (e.g., +12.69% improvement under 10% ImageNet training data), and\nmodel explainability. Moreover, TransHP also performs favorably against prior\nHIC methods, showing that TransHP well exploits the hierarchical information.\n","authors":["Wenhao Wang","Yifan Sun","Wei Li","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2304.06385v4.pdf","comment":"Accepted to NeurIPS 2023"},{"id":"http://arxiv.org/abs/2310.09066v1","updated":"2023-10-13T12:41:28Z","published":"2023-10-13T12:41:28Z","title":"pose-format: Library for Viewing, Augmenting, and Handling .pose Files","summary":" Managing and analyzing pose data is a complex task, with challenges ranging\nfrom handling diverse file structures and data types to facilitating effective\ndata manipulations such as normalization and augmentation. 
This paper presents\n\\texttt{pose-format}, a comprehensive toolkit designed to address these\nchallenges by providing a unified, flexible, and easy-to-use interface. The\nlibrary includes a specialized file format that encapsulates various types of\npose data, accommodating multiple individuals and an indefinite number of time\nframes, thus proving its utility for both image and video data. Furthermore, it\noffers seamless integration with popular numerical libraries such as NumPy,\nPyTorch, and TensorFlow, thereby enabling robust machine-learning applications.\nThrough benchmarking, we demonstrate that our \\texttt{.pose} file format offers\nvastly superior performance against prevalent formats like OpenPose, with added\nadvantages like self-contained pose specification. Additionally, the library\nincludes features for data normalization, augmentation, and easy-to-use\nvisualization capabilities, both in Python and Browser environments.\n\\texttt{pose-format} emerges as a one-stop solution, streamlining the\ncomplexities of pose data management and analysis.\n","authors":["Amit Moryossef","Mathias Müller","Rebecka Fahrni"],"pdf_url":"https://arxiv.org/pdf/2310.09066v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09016v1","updated":"2023-10-13T11:25:41Z","published":"2023-10-13T11:25:41Z","title":"A Spatial-Temporal Dual-Mode Mixed Flow Network for Panoramic Video\n Salient Object Detection","summary":" Salient object detection (SOD) in panoramic video is still in the initial\nexploration stage. The indirect application of 2D video SOD method to the\ndetection of salient objects in panoramic video has many unmet challenges, such\nas low detection accuracy, high model complexity, and poor generalization\nperformance. To overcome these hurdles, we design an Inter-Layer Attention\n(ILA) module, an Inter-Layer weight (ILW) module, and a Bi-Modal Attention\n(BMA) module. Based on these modules, we propose a Spatial-Temporal Dual-Mode\nMixed Flow Network (STDMMF-Net) that exploits the spatial flow of panoramic\nvideo and the corresponding optical flow for SOD. First, the ILA module\ncalculates the attention between adjacent level features of consecutive frames\nof panoramic video to improve the accuracy of extracting salient object\nfeatures from the spatial flow. Then, the ILW module quantifies the salient\nobject information contained in the features of each level to improve the\nfusion efficiency of the features of each level in the mixed flow. Finally, the\nBMA module improves the detection accuracy of STDMMF-Net. A large number of\nsubjective and objective experimental results testify that the proposed method\ndemonstrates better detection accuracy than the state-of-the-art (SOTA)\nmethods. Moreover, the comprehensive performance of the proposed method is\nbetter in terms of memory required for model inference, testing time,\ncomplexity, and generalization performance.\n","authors":["Xiaolei Chen","Pengcheng Zhang","Zelong Du","Ishfaq Ahmad"],"pdf_url":"https://arxiv.org/pdf/2310.09016v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.05000v2","updated":"2023-10-13T11:07:09Z","published":"2023-07-11T03:40:10Z","title":"Neural Point-based Volumetric Avatar: Surface-guided Neural Points for\n Efficient and Photorealistic Volumetric Head Avatar","summary":" Rendering photorealistic and dynamically moving human heads is crucial for\nensuring a pleasant and immersive experience in AR/VR and video conferencing\napplications. 
However, existing methods often struggle to model challenging\nfacial regions (e.g., mouth interior, eyes, hair/beard), resulting in\nunrealistic and blurry results. In this paper, we propose {\\fullname}\n({\\name}), a method that adopts the neural point representation as well as the\nneural volume rendering process and discards the predefined connectivity and\nhard correspondence imposed by mesh-based approaches. Specifically, the neural\npoints are strategically constrained around the surface of the target\nexpression via a high-resolution UV displacement map, achieving increased\nmodeling capacity and more accurate control. We introduce three technical\ninnovations to improve the rendering and training efficiency: a patch-wise\ndepth-guided (shading point) sampling strategy, a lightweight radiance decoding\nprocess, and a Grid-Error-Patch (GEP) ray sampling strategy during training. By\ndesign, our {\\name} is better equipped to handle topologically changing regions\nand thin structures while also ensuring accurate expression control when\nanimating avatars. Experiments conducted on three subjects from the Multiface\ndataset demonstrate the effectiveness of our designs, outperforming previous\nstate-of-the-art methods, especially in handling challenging facial regions.\n","authors":["Cong Wang","Di Kang","Yan-Pei Cao","Linchao Bao","Ying Shan","Song-Hai Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.05000v2.pdf","comment":"Accepted by SIGGRAPH Asia 2023"},{"id":"http://arxiv.org/abs/2309.14976v3","updated":"2023-10-13T10:36:14Z","published":"2023-09-26T14:52:51Z","title":"MoCaE: Mixture of Calibrated Experts Significantly Improves Object\n Detection","summary":" We propose an extremely simple and highly effective approach to faithfully\ncombine different object detectors to obtain a Mixture of Experts (MoE) that\nhas a superior accuracy to the individual experts in the mixture. We find that\nnaively combining these experts in a similar way to the well-known Deep\nEnsembles (DEs), does not result in an effective MoE. We identify the\nincompatibility between the confidence score distribution of different\ndetectors to be the primary reason for such failure cases. Therefore, to\nconstruct the MoE, our proposal is to first calibrate each individual detector\nagainst a target calibration function. Then, filter and refine all the\npredictions from different detectors in the mixture. We term this approach as\nMoCaE and demonstrate its effectiveness through extensive experiments on object\ndetection, instance segmentation and rotated object detection tasks.\nSpecifically, MoCaE improves (i) three strong object detectors on COCO test-dev\nby $2.4$ $\\mathrm{AP}$ by reaching $59.0$ $\\mathrm{AP}$; (ii) instance\nsegmentation methods on the challenging long-tailed LVIS dataset by $2.3$\n$\\mathrm{AP}$; and (iii) all existing rotated object detectors by reaching\n$82.62$ $\\mathrm{AP_{50}}$ on DOTA dataset, establishing a new state-of-the-art\n(SOTA). Code will be made public.\n","authors":["Kemal Oksuz","Selim Kuzucu","Tom Joy","Puneet K. 
Dokania"],"pdf_url":"https://arxiv.org/pdf/2309.14976v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08986v1","updated":"2023-10-13T10:06:39Z","published":"2023-10-13T10:06:39Z","title":"VCL Challenges 2023 at ICCV 2023 Technical Report: Bi-level Adaptation\n Method for Test-time Adaptive Object Detection","summary":" This report outlines our team's participation in VCL Challenges B Continual\nTest_time Adaptation, focusing on the technical details of our approach. Our\nprimary focus is Testtime Adaptation using bi_level adaptations, encompassing\nimage_level and detector_level adaptations. At the image level, we employ\nadjustable parameterbased image filters, while at the detector level, we\nleverage adjustable parameterbased mean teacher modules. Ultimately, through\nthe utilization of these bi_level adaptations, we have achieved a remarkable\n38.3% mAP on the target domain of the test set within VCL Challenges B. It is\nworth noting that the minimal drop in mAP, is mearly 4.2%, and the overall\nperformance is 32.5% mAP.\n","authors":["Chenyu Lin","Yusheng He","Zhengqing Zang","Chenwei Tang","Tao Wang","Jiancheng Lv"],"pdf_url":"https://arxiv.org/pdf/2310.08986v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08984v1","updated":"2023-10-13T10:03:01Z","published":"2023-10-13T10:03:01Z","title":"UniParser: Multi-Human Parsing with Unified Correlation Representation\n Learning","summary":" Multi-human parsing is an image segmentation task necessitating both\ninstance-level and fine-grained category-level information. However, prior\nresearch has typically processed these two types of information through\nseparate branches and distinct output formats, leading to inefficient and\nredundant frameworks. This paper introduces UniParser, which integrates\ninstance-level and category-level representations in three key aspects: 1) we\npropose a unified correlation representation learning approach, allowing our\nnetwork to learn instance and category features within the cosine space; 2) we\nunify the form of outputs of each modules as pixel-level segmentation results\nwhile supervising instance and category features using a homogeneous label\naccompanied by an auxiliary loss; and 3) we design a joint optimization\nprocedure to fuse instance and category representations. By virtual of unifying\ninstance-level and category-level output, UniParser circumvents manually\ndesigned post-processing techniques and surpasses state-of-the-art methods,\nachieving 49.3% AP on MHPv2.0 and 60.4% AP on CIHP. We will release our source\ncode, pretrained models, and online demos to facilitate future studies.\n","authors":["Jiaming Chu","Lei Jin","Junliang Xing","Jian Zhao"],"pdf_url":"https://arxiv.org/pdf/2310.08984v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08371v2","updated":"2023-10-13T09:20:09Z","published":"2023-10-12T14:40:24Z","title":"Worst-Case Morphs using Wasserstein ALI and Improved MIPGAN","summary":" A morph is a combination of two separate facial images and contains identity\ninformation of two different people. When used in an identity document, both\npeople can be authenticated by a biometric Face Recognition (FR) system. Morphs\ncan be generated using either a landmark-based approach or approaches based on\ndeep learning such as Generative Adversarial Networks (GAN). In a recent paper,\nwe introduced a \\emph{worst-case} upper bound on how challenging morphing\nattacks can be for an FR system. 
The closer morphs are to this upper bound, the\nbigger the challenge they pose to FR. We introduced an approach with which it\nwas possible to generate morphs that approximate this upper bound for a known\nFR system (white box), but not for unknown (black box) FR systems.\n In this paper, we introduce a morph generation method that can approximate\nworst-case morphs even when the FR system is not known. A key contribution is\nthat we include the goal of generating difficult morphs \\emph{during} training.\nOur method is based on Adversarially Learned Inference (ALI) and uses concepts\nfrom Wasserstein GANs trained with Gradient Penalty, which were introduced to\nstabilise the training of GANs. We include these concepts to achieve similar\nimprovement in training stability and call the resulting method Wasserstein ALI\n(WALI). We finetune WALI using loss functions designed specifically to improve\nthe ability to manipulate identity information in facial images and show how it\ncan generate morphs that are more challenging for FR systems than landmark- or\nGAN-based morphs. We also show how our findings can be used to improve MIPGAN,\nan existing StyleGAN-based morph generator.\n","authors":["Una M. Kelly","Meike Nauta","Lu Liu","Luuk J. Spreeuwers","Raymond N. J. Veldhuis"],"pdf_url":"https://arxiv.org/pdf/2310.08371v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08956v1","updated":"2023-10-13T09:04:52Z","published":"2023-10-13T09:04:52Z","title":"LRRU: Long-short Range Recurrent Updating Networks for Depth Completion","summary":" Existing deep learning-based depth completion methods generally employ\nmassive stacked layers to predict the dense depth map from sparse input data.\nAlthough such approaches greatly advance this task, their accompanied huge\ncomputational complexity hinders their practical applications. To accomplish\ndepth completion more efficiently, we propose a novel lightweight deep network\nframework, the Long-short Range Recurrent Updating (LRRU) network. Without\nlearning complex feature representations, LRRU first roughly fills the sparse\ninput to obtain an initial dense depth map, and then iteratively updates it\nthrough learned spatially-variant kernels. Our iterative update process is\ncontent-adaptive and highly flexible, where the kernel weights are learned by\njointly considering the guidance RGB images and the depth map to be updated,\nand large-to-small kernel scopes are dynamically adjusted to capture\nlong-to-short range dependencies. Our initial depth map has coarse but complete\nscene depth information, which helps relieve the burden of directly regressing\nthe dense depth from sparse ones, while our proposed method can effectively\nrefine it to an accurate depth map with less learnable parameters and inference\ntime. Experimental results demonstrate that our proposed LRRU variants achieve\nstate-of-the-art performance across different parameter regimes. 
In particular,\nthe LRRU-Base model outperforms competing approaches on the NYUv2 dataset, and\nranks 1st on the KITTI depth completion benchmark at the time of submission.\nProject page: https://npucvr.github.io/LRRU/.\n","authors":["Yufei Wang","Bo Li","Ge Zhang","Qi Liu","Tao Gao","Yuchao Dai"],"pdf_url":"https://arxiv.org/pdf/2310.08956v1.pdf","comment":"Published in ICCV 2023"},{"id":"http://arxiv.org/abs/2309.01429v2","updated":"2023-10-13T08:42:01Z","published":"2023-09-04T08:23:31Z","title":"Adapting Segment Anything Model for Change Detection in HR Remote\n Sensing Images","summary":" Vision Foundation Models (VFMs) such as the Segment Anything Model (SAM)\nallow zero-shot or interactive segmentation of visual contents, thus they are\nquickly applied in a variety of visual scenes. However, their direct use in\nmany Remote Sensing (RS) applications is often unsatisfactory due to the\nspecial imaging characteristics of RS images. In this work, we aim to utilize\nthe strong visual recognition capabilities of VFMs to improve the change\ndetection of high-resolution Remote Sensing Images (RSIs). We employ the visual\nencoder of FastSAM, an efficient variant of the SAM, to extract visual\nrepresentations in RS scenes. To adapt FastSAM to focus on some specific ground\nobjects in the RS scenes, we propose a convolutional adaptor to aggregate the\ntask-oriented change information. Moreover, to utilize the semantic\nrepresentations that are inherent to SAM features, we introduce a task-agnostic\nsemantic learning branch to model the semantic latent in bi-temporal RSIs. The\nresulting method, SAMCD, obtains superior accuracy compared to the SOTA methods\nand exhibits a sample-efficient learning ability that is comparable to\nsemi-supervised CD methods. To the best of our knowledge, this is the first\nwork that adapts VFMs for the CD of HR RSIs.\n","authors":["Lei Ding","Kun Zhu","Daifeng Peng","Hao Tang","Kuiwu Yang","Lorenzo Bruzzone"],"pdf_url":"https://arxiv.org/pdf/2309.01429v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08949v1","updated":"2023-10-13T08:38:56Z","published":"2023-10-13T08:38:56Z","title":"Making Multimodal Generation Easier: When Diffusion Models Meet LLMs","summary":" We present EasyGen, an efficient model designed to enhance multimodal\nunderstanding and generation by harnessing the capabilities of diffusion models\nand large language models (LLMs). Unlike existing multimodal models that\npredominately depend on encoders like CLIP or ImageBind and need ample amounts\nof training data to bridge the gap between modalities, EasyGen is built upon a\nbidirectional conditional diffusion model named BiDiffuser, which promotes more\nefficient interactions between modalities. EasyGen handles image-to-text\ngeneration by integrating BiDiffuser and an LLM via a simple projection layer.\nUnlike most existing multimodal models that are limited to generating text\nresponses, EasyGen can also facilitate text-to-image generation by leveraging\nthe LLM to create textual descriptions, which can be interpreted by BiDiffuser\nto generate appropriate visual responses. Extensive quantitative and\nqualitative experiments demonstrate the effectiveness of EasyGen, whose\ntraining can be easily achieved in a lab setting. 
The source code is available\nat https://github.com/zxy556677/EasyGen.\n","authors":["Xiangyu Zhao","Bo Liu","Qijiong Liu","Guangyuan Shi","Xiao-Ming Wu"],"pdf_url":"https://arxiv.org/pdf/2310.08949v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08948v1","updated":"2023-10-13T08:35:02Z","published":"2023-10-13T08:35:02Z","title":"Federated Class-Incremental Learning with Prompting","summary":" As Web technology continues to develop, it has become increasingly common to\nuse data stored on different clients. At the same time, federated learning has\nreceived widespread attention due to its ability to protect data privacy while\nletting models learn from data which is distributed across various clients.\nHowever, most existing works assume that the client's data are fixed. In\nreal-world scenarios, such an assumption is most likely not true as data may be\ncontinuously generated and new classes may also appear. To this end, we focus\non the practical and challenging federated class-incremental learning (FCIL)\nproblem. For FCIL, the local and global models may suffer from catastrophic\nforgetting on old classes caused by the arrival of new classes, and the data\ndistributions of clients are non-independent and identically distributed\n(non-iid).\n In this paper, we propose a novel method called Federated Class-Incremental\nLearning with PrompTing (FCILPT). Given the privacy and limited memory, FCILPT\ndoes not use a rehearsal-based buffer to keep exemplars of old data. We choose\nto use prompts to ease the catastrophic forgetting of the old classes.\nSpecifically, we encode the task-relevant and task-irrelevant knowledge into\nprompts, preserving the old and new knowledge of the local clients and solving\nthe problem of catastrophic forgetting. We first sort the task information in\nthe prompt pool in the local clients to align the task information on different\nclients before global aggregation. It ensures that the same task's knowledge\nis fully integrated, solving the problem of non-iid caused by the lack of\nclasses among different clients in the same incremental task. Experiments on\nCIFAR-100, Mini-ImageNet, and Tiny-ImageNet demonstrate that FCILPT achieves\nsignificant accuracy improvements over the state-of-the-art methods.\n","authors":["Jiale Liu","Yu-Wei Zhan","Chong-Yu Zhang","Xin Luo","Zhen-Duo Chen","Yinwei Wei","Xin-Shun Xu"],"pdf_url":"https://arxiv.org/pdf/2310.08948v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2112.09726v3","updated":"2023-10-13T08:10:41Z","published":"2021-12-17T19:22:01Z","title":"Soundify: Matching Sound Effects to Video","summary":" In the art of video editing, sound helps add character to an object and\nimmerse the viewer within a space. Through formative interviews with\nprofessional editors (N=10), we found that the task of adding sounds to video\ncan be challenging. This paper presents Soundify, a system that assists editors\nin matching sounds to video. Given a video, Soundify identifies matching\nsounds, synchronizes the sounds to the video, and dynamically adjusts panning\nand volume to create spatial audio. In a human evaluation study (N=889), we\nshow that Soundify is capable of matching sounds to video out-of-the-box for a\ndiverse range of audio categories. 
In a within-subjects expert study (N=12), we\ndemonstrate the usefulness of Soundify in helping video editors match sounds to\nvideo with lighter workload, reduced task completion time, and improved\nusability.\n","authors":["David Chuan-En Lin","Anastasis Germanidis","Cristóbal Valenzuela","Yining Shi","Nikolas Martelaro"],"pdf_url":"https://arxiv.org/pdf/2112.09726v3.pdf","comment":"Full paper in UIST 2023; Short paper in NeurIPS 2021 ML4CD Workshop;\n Online demo: http://soundify.cc"},{"id":"http://arxiv.org/abs/2310.08934v1","updated":"2023-10-13T08:00:33Z","published":"2023-10-13T08:00:33Z","title":"Online Adaptive Disparity Estimation for Dynamic Scenes in Structured\n Light Systems","summary":" In recent years, deep neural networks have shown remarkable progress in dense\ndisparity estimation from dynamic scenes in monocular structured light systems.\nHowever, their performance significantly drops when applied in unseen\nenvironments. To address this issue, self-supervised online adaptation has been\nproposed as a solution to bridge this performance gap. Unlike traditional\nfine-tuning processes, online adaptation performs test-time optimization to\nadapt networks to new domains. Therefore, achieving fast convergence during the\nadaptation process is critical for attaining satisfactory accuracy. In this\npaper, we propose an unsupervised loss function based on long sequential\ninputs. It ensures better gradient directions and faster convergence. Our loss\nfunction is designed using a multi-frame pattern flow, which comprises a set of\nsparse trajectories of the projected pattern along the sequence. We estimate\nthe sparse pseudo ground truth with a confidence mask using a filter-based\nmethod, which guides the online adaptation process. Our proposed framework\nsignificantly improves the online adaptation speed and achieves superior\nperformance on unseen data.\n","authors":["Rukun Qiao","Hiroshi Kawasaki","Hongbin Zha"],"pdf_url":"https://arxiv.org/pdf/2310.08934v1.pdf","comment":"Accepted by the 36th IEEE/RSJ International Conference on Intelligent\n Robots and Systems, 2023"},{"id":"http://arxiv.org/abs/2310.08932v1","updated":"2023-10-13T07:55:33Z","published":"2023-10-13T07:55:33Z","title":"TIDE: Temporally Incremental Disparity Estimation via Pattern Flow in\n Structured Light System","summary":" We introduce the Temporally Incremental Disparity Estimation Network (TIDE-Net),\na learning-based technique for disparity computation in mono-camera structured\nlight systems. In our hardware setting, a static pattern is projected onto a\ndynamic scene and captured by a monocular camera. Different from most former\ndisparity estimation methods that operate in a frame-wise manner, our network\nacquires disparity maps in a temporally incremental way. Specifically, we\nexploit the deformation of projected patterns (named pattern flow) on captured\nimage sequences to model the temporal information. Notably, this newly\nproposed pattern flow formulation reflects the disparity changes along the\nepipolar line, which is a special form of optical flow. Tailored for pattern\nflow, the TIDE-Net, a recurrent architecture, is proposed and implemented. For\neach incoming frame, our model fuses correlation volumes (from the current frame)\nand disparity (from the former frame) warped by pattern flow. From fused features,\nthe final stage of TIDE-Net estimates the residual disparity rather than the\nfull disparity, as conducted by many previous methods. 
Interestingly, this\ndesign brings clear empirical advantages in terms of efficiency and\ngeneralization ability. Using only synthetic data for training, our extensive\nevaluation results (w.r.t. both accuracy and efficiency metrics) show superior\nperformance compared to several SOTA models on unseen real data. The code is available\nat https://github.com/CodePointer/TIDENet.\n","authors":["Rukun Qiao","Hiroshi Kawasaki","Hongbin Zha"],"pdf_url":"https://arxiv.org/pdf/2310.08932v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08929v1","updated":"2023-10-13T07:51:00Z","published":"2023-10-13T07:51:00Z","title":"Towards Interpretable Controllability in Object-Centric Learning","summary":" The binding problem in artificial neural networks is actively explored with\nthe goal of achieving human-level recognition skills through the comprehension\nof the world in terms of symbol-like entities. Especially in the field of\ncomputer vision, object-centric learning (OCL) is extensively researched to\nbetter understand complex scenes by acquiring object representations or slots.\nWhile recent studies in OCL have made strides with complex images or videos,\nthe interpretability and interactivity over object representation remain\nlargely uncharted, still holding promise in the field of OCL. In this paper, we\nintroduce a novel method, Slot Attention with Image Augmentation (SlotAug), to\nexplore the possibility of learning interpretable controllability over slots in\na self-supervised manner by utilizing an image augmentation strategy. We also\ndevise the concept of sustainability in controllable slots by introducing\niterative and reversible controls over slots with two proposed submethods:\nAuxiliary Identity Manipulation and Slot Consistency Loss. Extensive empirical\nstudies and theoretical validation confirm the effectiveness of our approach,\noffering a novel capability for interpretable and sustainable control of object\nrepresentations. Code will be available soon.\n","authors":["Jinwoo Kim","Janghyuk Choi","Jaehyun Kang","Changyeon Lee","Ho-Jin Choi","Seon Joo Kim"],"pdf_url":"https://arxiv.org/pdf/2310.08929v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08928v1","updated":"2023-10-13T07:50:37Z","published":"2023-10-13T07:50:37Z","title":"SIDE: Self-supervised Intermediate Domain Exploration for Source-free\n Domain Adaptation","summary":" Domain adaptation aims to alleviate the domain shift when transferring the\nknowledge learned from the source domain to the target domain. Due to privacy\nissues, source-free domain adaptation (SFDA), where source data is unavailable\nduring adaptation, has recently become very demanding yet challenging. Existing\nSFDA methods focus on either self-supervised learning of target samples or\nreconstruction of virtual source data. The former overlooks the transferable\nknowledge in the source model, whilst the latter introduces even more\nuncertainty. To address the above issues, this paper proposes self-supervised\nintermediate domain exploration (SIDE) that effectively bridges the domain gap\nwith an intermediate domain, where samples are cyclically filtered out in a\nself-supervised fashion. First, we propose cycle intermediate domain filtering\n(CIDF) to cyclically select intermediate samples with similar distributions\nover source and target domains. Second, with the aid of those intermediate\nsamples, an inter-domain gap transition (IDGT) module is developed to mitigate\npossible distribution mismatches between the source and target data. 
Finally,\nwe introduce cross-view consistency learning (CVCL) to maintain the intrinsic\nclass discriminability whilst adapting the model to the target domain.\nExtensive experiments on three popular benchmarks, i.e., Office-31, Office-Home\nand VisDA-C, show that our proposed SIDE achieves competitive performance\nagainst state-of-the-art methods.\n","authors":["Jiamei Liu","Han Sun","Yizhen Jia","Jie Qin","Huiyu Zhou","Ningzhong Liu"],"pdf_url":"https://arxiv.org/pdf/2310.08928v1.pdf","comment":"code at https://github.com/se111/SIDE"},{"id":"http://arxiv.org/abs/2201.03454v3","updated":"2023-10-13T07:48:24Z","published":"2022-01-10T16:53:39Z","title":"3D Face Morphing Attacks: Generation, Vulnerability and Detection","summary":" Face Recognition systems (FRS) have been found to be vulnerable to morphing\nattacks, where the morphed face image is generated by blending the face images\nfrom contributory data subjects. This work presents a novel direction for\ngenerating face-morphing attacks in 3D. To this end, we introduced a novel\napproach based on blending 3D face point clouds corresponding to contributory\ndata subjects. The proposed method generates 3D face morphing by projecting the\ninput 3D face point clouds onto depth maps and 2D color images, followed by\nimage blending and wrapping operations performed independently on the color\nimages and depth maps. We then back-projected the 2D morphing color map and the\ndepth map to the point cloud using the canonical (fixed) view. Given that the\ngenerated 3D face morphing models will result in holes owing to a single\ncanonical view, we have proposed a new algorithm for hole filling that will\nresult in a high-quality 3D face morphing model. Extensive experiments were\nconducted on the newly generated 3D face dataset comprising 675 3D scans\ncorresponding to 41 unique data subjects and a publicly available database\n(Facescape) with 100 data subjects. Experiments were performed to benchmark the\nvulnerability of the proposed 3D morph-generation scheme against automatic\n2D and 3D FRS, and human observer analysis. We also presented a quantitative\nassessment of the quality of the generated 3D face-morphing models using eight\ndifferent quality metrics. Finally, we propose three different 3D face Morphing\nAttack Detection (3D-MAD) algorithms to benchmark the performance of 3D face\nmorphing attack detection techniques.\n","authors":["Jag Mohan Singh","Raghavendra Ramachandra"],"pdf_url":"https://arxiv.org/pdf/2201.03454v3.pdf","comment":"The paper is accepted at IEEE Transactions on Biometrics, Behavior\n and Identity Science"},{"id":"http://arxiv.org/abs/2310.08921v1","updated":"2023-10-13T07:46:57Z","published":"2023-10-13T07:46:57Z","title":"Feature Proliferation -- the \"Cancer\" in StyleGAN and its Treatments","summary":" Despite the success of StyleGAN in image synthesis, the images it synthesizes\nare not always perfect and the well-known truncation trick has become a\nstandard post-processing technique for StyleGAN to synthesize high-quality\nimages. Although effective, it has long been noted that the truncation trick\ntends to reduce the diversity of synthesized images and unnecessarily\nsacrifices many distinct image features. To address this issue, in this paper,\nwe first delve into the StyleGAN image synthesis mechanism and discover an\nimportant phenomenon, namely Feature Proliferation, which demonstrates how\nspecific features reproduce with forward propagation. 
Then, we show how the\noccurrence of Feature Proliferation results in StyleGAN image artifacts. As an\nanalogy, we refer to it as the \"cancer\" in StyleGAN from its proliferating and\nmalignant nature. Finally, we propose a novel feature rescaling method that\nidentifies and modulates risky features to mitigate feature proliferation.\nThanks to our discovery of Feature Proliferation, the proposed feature\nrescaling method is less destructive and retains more useful image features\nthan the truncation trick, as it is more fine-grained and works in a\nlower-level feature space rather than a high-level latent space. Experimental\nresults justify the validity of our claims and the effectiveness of the\nproposed feature rescaling method. Our code is available at\nhttps://github.com/songc42/Feature-proliferation.\n","authors":["Shuang Song","Yuanbang Liang","Jing Wu","Yu-Kun Lai","Yipeng Qin"],"pdf_url":"https://arxiv.org/pdf/2310.08921v1.pdf","comment":"Accepted at ICCV 2023"},{"id":"http://arxiv.org/abs/2303.11793v2","updated":"2023-10-13T07:37:56Z","published":"2023-03-21T12:22:59Z","title":"OTJR: Optimal Transport Meets Optimal Jacobian Regularization for\n Adversarial Robustness","summary":" The Web, as a rich medium of diverse content, has been constantly under the\nthreat of malicious entities exploiting its vulnerabilities, especially with\nthe rapid proliferation of deep learning applications in various web services.\nOne such vulnerability, crucial to the fidelity and integrity of web content,\nis the susceptibility of deep neural networks to adversarial perturbations,\nespecially concerning images - a dominant form of data on the web. In light of\nthe recent advancements in the robustness of classifiers, we delve deep into\nthe intricacies of adversarial training (AT) and Jacobian regularization, two\npivotal defenses. Our work is the first to carefully analyze and characterize\nthese two schools of approaches, both theoretically and empirically, to\ndemonstrate how each approach impacts the robust learning of a classifier.\nNext, we propose our novel Optimal Transport with Jacobian regularization\nmethod, dubbed OTJR, jointly incorporating the input-output Jacobian\nregularization into the AT by leveraging optimal transport theory. In\nparticular, we employ the Sliced Wasserstein (SW) distance that can efficiently\npush the adversarial samples' representations closer to those of clean samples,\nregardless of the number of classes within the dataset. The SW distance\nprovides the adversarial samples' movement directions, which are much more\ninformative and powerful for the Jacobian regularization. Our empirical\nevaluations set a new standard in the domain, with our method achieving\ncommendable accuracies of 51.41% on the CIFAR-10 and 28.49% on the\nCIFAR-100 datasets under the AutoAttack metric. In a real-world\ndemonstration, we subject images sourced from the Internet to online\nadversarial attacks, reinforcing the efficacy and relevance of our model in\ndefending against sophisticated web-image perturbations.\n","authors":["Binh M. Le","Shahroz Tariq","Simon S. 
Woo"],"pdf_url":"https://arxiv.org/pdf/2303.11793v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08910v1","updated":"2023-10-13T07:31:04Z","published":"2023-10-13T07:31:04Z","title":"Scalarization for Multi-Task and Multi-Domain Learning at Scale","summary":" Training a single model on multiple input domains and/or output tasks allows\nfor compressing information from multiple sources into a unified backbone, hence\nimproving model efficiency. It also enables potential positive knowledge\ntransfer across tasks/domains, leading to improved accuracy and data-efficient\ntraining. However, optimizing such networks is a challenge, in particular due\nto discrepancies between the different tasks or domains: Despite several\nhypotheses and solutions proposed over the years, recent work has shown that\nuniform scalarization training, i.e., simply minimizing the average of the task\nlosses, yields on-par performance with more costly SotA optimization methods.\nThis raises the issue of how well we understand the training dynamics of\nmulti-task and multi-domain networks. In this work, we first devise a\nlarge-scale unified analysis of multi-domain and multi-task learning to better\nunderstand the dynamics of scalarization across varied task/domain combinations\nand model sizes. Following these insights, we then propose to leverage\npopulation-based training to efficiently search for the optimal scalarization\nweights when dealing with a large number of tasks or domains.\n","authors":["Amelie Royer","Tijmen Blankevoort","Babak Ehteshami Bejnordi"],"pdf_url":"https://arxiv.org/pdf/2310.08910v1.pdf","comment":"NeurIPS 2023; https://openreview.net/forum?id=TSuq3debnD"},{"id":"http://arxiv.org/abs/2310.08904v1","updated":"2023-10-13T07:19:30Z","published":"2023-10-13T07:19:30Z","title":"3D Understanding of Deformable Linear Objects: Datasets and\n Transferability Benchmark","summary":" Deformable linear objects are vastly represented in our everyday lives. It is\noften challenging even for humans to visually understand them, as the same\nobject can be entangled so that it appears completely different. Examples of\ndeformable linear objects include blood vessels and wiring harnesses, vital to\nthe functioning of their corresponding systems, such as the human body and a\nvehicle. However, no point cloud datasets exist for studying 3D deformable\nlinear objects. Therefore, we are introducing two point cloud datasets,\nPointWire and PointVessel. We evaluated state-of-the-art methods on the\nproposed large-scale 3D deformable linear object benchmarks. Finally, we\nanalyzed the generalization capabilities of these methods by conducting\ntransferability experiments on the PointWire and PointVessel datasets.\n","authors":["Bare Luka Žagar","Tim Hertel","Mingyu Liu","Ekim Yurtsever","Alois C. 
Knoll"],"pdf_url":"https://arxiv.org/pdf/2310.08904v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.08247v3","updated":"2023-10-13T07:14:24Z","published":"2023-06-14T05:25:06Z","title":"Diffusion in Diffusion: Cyclic One-Way Diffusion for\n Text-Vision-Conditioned Generation","summary":" Originating from the diffusion phenomenon in physics that describes particle\nmovement, the diffusion generative models inherit the characteristics of\nstochastic random walk in the data space along the denoising trajectory.\nHowever, the intrinsic mutual interference among image regions contradicts the\nneed for practical downstream application scenarios where the preservation of\nlow-level pixel information from given conditioning is desired (e.g.,\ncustomization tasks like personalized generation and inpainting based on a\nuser-provided single image). In this work, we investigate the diffusion\n(physics) in diffusion (machine learning) properties and propose our Cyclic\nOne-Way Diffusion (COW) method to control the direction of diffusion phenomenon\ngiven a pre-trained frozen diffusion model for versatile customization\napplication scenarios, where the low-level pixel information from the\nconditioning needs to be preserved. Notably, unlike most current methods that\nincorporate additional conditions by fine-tuning the base text-to-image\ndiffusion model or learning auxiliary networks, our method provides a novel\nperspective to understand the task needs and is applicable to a wider range of\ncustomization scenarios in a learning-free manner. Extensive experiment results\nshow that our proposed COW can achieve more flexible customization based on\nstrict visual conditions in different application settings.\n","authors":["Ruoyu Wang","Yongqi Yang","Zhihao Qian","Ye Zhu","Yu Wu"],"pdf_url":"https://arxiv.org/pdf/2306.08247v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2202.03574v4","updated":"2023-10-13T07:13:51Z","published":"2022-02-04T12:30:49Z","title":"Structured Prediction Problem Archive","summary":" Structured prediction problems are one of the fundamental tools in machine\nlearning. In order to facilitate algorithm development for their numerical\nsolution, we collect in one place a large number of datasets in easy to read\nformats for a diverse set of problem classes. We provide archival links to\ndatasets, description of the considered problems and problem formats, and a\nshort summary of problem characteristics including size, number of instances\netc. For reference we also give a non-exhaustive selection of algorithms\nproposed in the literature for their solution. We hope that this central\nrepository will make benchmarking and comparison to established works easier.\nWe welcome submission of interesting new datasets and algorithms for inclusion\nin our archive.\n","authors":["Paul Swoboda","Ahmed Abbas","Florian Bernard","Andrea Hornakova","Paul Roetzer","Bogdan Savchynskyy"],"pdf_url":"https://arxiv.org/pdf/2202.03574v4.pdf","comment":"Added new shape matching instances based of learned descriptors"},{"id":"http://arxiv.org/abs/2305.15710v2","updated":"2023-10-13T07:04:13Z","published":"2023-05-25T04:44:50Z","title":"CUEING: a lightweight model to Capture hUman attEntion In driviNG","summary":" Discrepancies in decision-making between Autonomous Driving Systems (ADS) and\nhuman drivers underscore the need for intuitive human gaze predictors to bridge\nthis gap, thereby improving user trust and experience. 
Existing gaze datasets,\ndespite their value, suffer from noise that hampers effective training.\nFurthermore, current gaze prediction models exhibit inconsistency across\ndiverse scenarios and demand substantial computational resources, restricting\ntheir on-board deployment in autonomous vehicles. We propose a novel adaptive\ncleansing technique for purging noise from existing gaze datasets, coupled with\na robust, lightweight convolutional self-attention gaze prediction model. Our\napproach not only significantly enhances model generalizability and performance\nby up to 12.13% but also ensures a remarkable reduction in model complexity by\nup to 98.2% compared to the state-of-the-art, making in-vehicle deployment\nfeasible to augment ADS decision visualization and performance.\n","authors":["Linfeng Liang","Yao Deng","Yang Zhang","Jianchao Lu","Chen Wang","Quanzheng Sheng","Xi Zheng"],"pdf_url":"https://arxiv.org/pdf/2305.15710v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08897v1","updated":"2023-10-13T06:58:52Z","published":"2023-10-13T06:58:52Z","title":"Self supervised convolutional kernel based handcrafted feature\n harmonization: Enhanced left ventricle hypertension disease phenotyping on\n echocardiography","summary":" Radiomics, a medical imaging technique, extracts quantitative handcrafted\nfeatures from images to predict diseases. Harmonization of those features\nensures consistent feature extraction across various imaging devices and\nprotocols. Methods for harmonization include standardized imaging protocols,\nstatistical adjustments, and evaluating feature robustness. Myocardial diseases\nsuch as Left Ventricular Hypertrophy (LVH) and Hypertensive Heart Disease (HHD)\nare diagnosed via echocardiography, but variable imaging settings pose\nchallenges. Harmonization techniques are crucial for applying handcrafted\nfeatures in disease diagnosis in such scenarios. Self-supervised learning (SSL)\nenhances data understanding within limited datasets and adapts to diverse data\nsettings. ConvNeXt-V2 integrates convolutional layers into SSL, displaying\nsuperior performance in various tasks. This study focuses on convolutional\nfilters within SSL, using them as preprocessing to convert images into feature\nmaps for handcrafted feature harmonization. Our proposed method excelled in\nharmonization evaluation and exhibited superior LVH classification performance\ncompared to existing methods.\n","authors":["Jina Lee","Youngtaek Hong","Dawun Jeong","Yeonggul Jang","Sihyeon Jeong","Taekgeun Jung","Yeonyee E. Yoon","Inki Moon","Seung-Ah Lee","Hyuk-Jae Chang"],"pdf_url":"https://arxiv.org/pdf/2310.08897v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08892v1","updated":"2023-10-13T06:53:28Z","published":"2023-10-13T06:53:28Z","title":"Image Cropping under Design Constraints","summary":" Image cropping is essential in image editing for obtaining a compositionally\nenhanced image. In display media, image cropping is a prospective technique for\nautomatically creating media content. However, image cropping for media\ncontents is often required to satisfy various constraints, such as an aspect\nratio and blank regions for placing texts or objects. We call this problem\nimage cropping under design constraints. To achieve image cropping under design\nconstraints, we propose a score function-based approach, which computes scores\nthat indicate whether cropped results are aesthetically plausible and satisfy the design\nconstraints. 
We explore two derived approaches, a proposal-based approach and\na heatmap-based approach, and we construct a dataset for evaluating the\nperformance of the proposed approaches on image cropping under design\nconstraints. In experiments, we demonstrate that the proposed approaches\noutperform a baseline, and we observe that the proposal-based approach is\nbetter than the heatmap-based approach under the same computation cost, but the\nheatmap-based approach leads to better scores by increasing computation cost.\nThe experimental results indicate that balancing aesthetically plausible\nregions and satisfying design constraints is not a trivial problem and requires\na sensitive balance, and both proposed approaches are reasonable alternatives.\n","authors":["Takumi Nishiyasu","Wataru Shimoda","Yoichi Sato"],"pdf_url":"https://arxiv.org/pdf/2310.08892v1.pdf","comment":"Accepted at ACMMM Asia"},{"id":"http://arxiv.org/abs/2310.08888v1","updated":"2023-10-13T06:48:38Z","published":"2023-10-13T06:48:38Z","title":"A Hybrid Transfer Learning Assisted Decision Support System for Accurate\n Prediction of Alzheimer Disease","summary":" Alzheimer's disease (AD) is the most common long-term illness in elderly\npeople. In recent years, deep learning has become popular in the area of\nmedical imaging and has had a lot of success there. It has become the most\neffective way to look at medical images. When it comes to detecting AD, the\ndeep neural model is more accurate and effective than general machine learning.\nOur research contributes to the development of a more comprehensive\nunderstanding and detection of the disease by identifying four distinct classes\nthat are predictive of AD with a high weighted accuracy of 98.91%. A unique\nstrategy has been proposed in this study to improve the accuracy of the imbalanced dataset\nclassification problem via the combination of ensemble averaging models and\nfive different transfer learning models.\nEfficientNetB0+Resnet152 (effnet+res152) and\nInceptionV3+EfficientNetB0+Resnet50 (incep+effnet+res50) models have been\nfine-tuned and have reached the highest weighted accuracy for multi-class AD\nstage classifications.\n","authors":["Mahin Khan Mahadi","Abdullah Abdullah","Jamal Uddin","Asif Newaz"],"pdf_url":"https://arxiv.org/pdf/2310.08888v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.03434v7","updated":"2023-10-13T06:46:27Z","published":"2022-12-07T03:39:18Z","title":"Name Your Colour For the Task: Artificially Discover Colour Naming via\n Colour Quantisation Transformer","summary":" The long-standing theory that a colour-naming system evolves under dual\npressure of efficient communication and perceptual mechanism is supported by\nmore and more linguistic studies, including analysing four decades of\ndiachronic data from the Nafaanra language. This inspires us to explore whether\nmachine learning could evolve and discover a similar colour-naming system via\noptimising the communication efficiency represented by high-level recognition\nperformance. Here, we propose a novel colour quantisation transformer,\nCQFormer, that quantises colour space while maintaining the accuracy of machine\nrecognition on the quantised images. Given an RGB image, Annotation Branch maps\nit into an index map before generating the quantised image with a colour\npalette; meanwhile the Palette Branch utilises a key-point detection way to\nfind proper colours in the palette among the whole colour space. 
By interacting\nwith colour annotation, CQFormer is able to balance both the machine vision\naccuracy and colour perceptual structure such as distinct and stable colour\ndistribution for discovered colour system. Very interestingly, we even observe\nthe consistent evolution pattern between our artificial colour system and basic\ncolour terms across human languages. Besides, our colour quantisation method\nalso offers an efficient quantisation method that effectively compresses the\nimage storage while maintaining high performance in high-level recognition\ntasks such as classification and detection. Extensive experiments demonstrate\nthe superior performance of our method with extremely low bit-rate colours,\nshowing potential to integrate into quantisation network to quantities from\nimage to network activation. The source code is available at\nhttps://github.com/ryeocthiv/CQFormer\n","authors":["Shenghan Su","Lin Gu","Yue Yang","Zenghui Zhang","Tatsuya Harada"],"pdf_url":"https://arxiv.org/pdf/2212.03434v7.pdf","comment":"ICCV 2023 Oral"},{"id":"http://arxiv.org/abs/2310.08884v1","updated":"2023-10-13T06:34:23Z","published":"2023-10-13T06:34:23Z","title":"Extending Multi-modal Contrastive Representations","summary":" Multi-modal contrastive representation (MCR) of more than three modalities is\ncritical in multi-modal learning. Although recent methods showcase impressive\nachievements, the high dependence on large-scale, high-quality paired data and\nthe expensive training costs limit their further development. Inspired by\nrecent C-MCR, this paper proposes Extending Multimodal Contrastive\nRepresentation (Ex-MCR), a training-efficient and paired-data-free method to\nflexibly learn unified contrastive representation space for more than three\nmodalities by integrating the knowledge of existing MCR spaces. Specifically,\nEx-MCR aligns multiple existing MCRs into the same based MCR, which can\neffectively preserve the original semantic alignment of the based MCR. Besides,\nwe comprehensively enhance the entire learning pipeline for aligning MCR spaces\nfrom the perspectives of training data, architecture, and learning objectives.\nWith the preserved original modality alignment and the enhanced space\nalignment, Ex-MCR shows superior representation learning performance and\nexcellent modality extensibility. To demonstrate the effectiveness of Ex-MCR,\nwe align the MCR spaces of CLAP (audio-text) and ULIP (3D-vision) into the CLIP\n(vision-text), leveraging the overlapping text and image modality,\nrespectively. Remarkably, without using any paired data, Ex-MCR learns a\n3D-image-text-audio unified contrastive representation, and it achieves\nstate-of-the-art performance on audio-visual, 3D-image, audio-text, visual-text\nretrieval, and 3D object classification tasks. 
More importantly, extensive\nqualitative results further demonstrate the emergent semantic alignment between\nthe extended modalities (e.g., audio and 3D), which highlights the great\npotential of modality extensibility.\n","authors":["Zehan Wang","Ziang Zhang","Luping Liu","Yang Zhao","Haifeng Huang","Tao Jin","Zhou Zhao"],"pdf_url":"https://arxiv.org/pdf/2310.08884v1.pdf","comment":"Our code is available at https://github.com/MCR-PEFT/Ex-MCR"},{"id":"http://arxiv.org/abs/2310.08872v1","updated":"2023-10-13T05:48:42Z","published":"2023-10-13T05:48:42Z","title":"R&B: Region and Boundary Aware Zero-shot Grounded Text-to-image\n Generation","summary":" Recent text-to-image (T2I) diffusion models have achieved remarkable progress\nin generating high-quality images given text-prompts as input. However, these\nmodels fail to convey appropriate spatial composition specified by a layout\ninstruction. In this work, we probe into zero-shot grounded T2I generation with\ndiffusion models, that is, generating images corresponding to the input layout\ninformation without training auxiliary modules or finetuning diffusion models.\nWe propose a Region and Boundary (R&B) aware cross-attention guidance approach\nthat gradually modulates the attention maps of diffusion model during\ngenerative process, and assists the model to synthesize images (1) with high\nfidelity, (2) highly compatible with textual input, and (3) interpreting layout\ninstructions accurately. Specifically, we leverage the discrete sampling to\nbridge the gap between consecutive attention maps and discrete layout\nconstraints, and design a region-aware loss to refine the generative layout\nduring diffusion process. We further propose a boundary-aware loss to\nstrengthen object discriminability within the corresponding regions.\nExperimental results show that our method outperforms existing state-of-the-art\nzero-shot grounded T2I generation methods by a large margin both qualitatively\nand quantitatively on several benchmarks.\n","authors":["Jiayu Xiao","Liang Li","Henglei Lv","Shuhui Wang","Qingming Huang"],"pdf_url":"https://arxiv.org/pdf/2310.08872v1.pdf","comment":"Preprint. Under review"},{"id":"http://arxiv.org/abs/2310.03986v2","updated":"2023-10-13T05:35:40Z","published":"2023-10-06T03:04:21Z","title":"Robust Multimodal Learning with Missing Modalities via\n Parameter-Efficient Adaptation","summary":" Multimodal learning seeks to utilize data from multiple sources to improve\nthe overall performance of downstream tasks. It is desirable for redundancies\nin the data to make multimodal systems robust to missing or corrupted\nobservations in some correlated modalities. However, we observe that the\nperformance of several existing multimodal networks significantly deteriorates\nif one or multiple modalities are absent at test time. To enable robustness to\nmissing modalities, we propose simple and parameter-efficient adaptation\nprocedures for pretrained multimodal networks. In particular, we exploit\nlow-rank adaptation and modulation of intermediate features to compensate for\nthe missing modalities. We demonstrate that such adaptation can partially\nbridge performance drop due to missing modalities and outperform independent,\ndedicated networks trained for the available modality combinations in some\ncases. The proposed adaptation requires extremely small number of parameters\n(e.g., fewer than 0.7% of the total parameters in most experiments). 
We conduct\na series of experiments to highlight the robustness of our proposed method\nusing diverse datasets for RGB-thermal and RGB-Depth semantic segmentation,\nmultimodal material segmentation, and multimodal sentiment analysis tasks. Our\nproposed method demonstrates versatility across various tasks and datasets, and\noutperforms existing methods for robust multimodal learning with missing\nmodalities.\n","authors":["Md Kaykobad Reza","Ashley Prater-Bennette","M. Salman Asif"],"pdf_url":"https://arxiv.org/pdf/2310.03986v2.pdf","comment":"18 pages, 3 figures, 11 tables"},{"id":"http://arxiv.org/abs/2310.08861v1","updated":"2023-10-13T05:08:35Z","published":"2023-10-13T05:08:35Z","title":"Re-initialization-free Level Set Method via Molecular Beam Epitaxy\n Equation Regularization for Image Segmentation","summary":" The variational level set method has become a powerful tool in image segmentation\ndue to its ability to handle complex topological changes and maintain\ncontinuity and smoothness in the process of evolution. However, its evolution\nprocess can be unstable, which results in over-flattened or over-sharpened\ncontours and segmentation failure. To improve the accuracy and stability of\nevolution, we propose a high-order level set variational segmentation method\nintegrated with molecular beam epitaxy (MBE) equation regularization. This\nmethod uses the crystal growth in the MBE process to limit the evolution of the\nlevel set function, and thus can avoid the re-initialization in the evolution\nprocess and regulate the smoothness of the segmented curve. It also works for\nnoisy images with intensity inhomogeneity, which is a challenge in image\nsegmentation. To solve the variational model, we derive the gradient flow and\ndesign a scalar auxiliary variable (SAV) scheme coupled with the fast Fourier\ntransform (FFT), which can significantly improve the computational efficiency\ncompared with the traditional semi-implicit and semi-explicit schemes. Numerical\nexperiments show that the proposed method can generate smooth segmentation\ncurves, retain fine segmentation targets and obtain robust segmentation results\nof small objects. Compared to existing level set methods, this model is\nstate-of-the-art in both accuracy and efficiency.\n","authors":["Fanghui Song","Jiebao Sun","Shengzhu Shi","Zhichang Guo","Dazhi Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.08861v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02601v3","updated":"2023-10-13T05:04:35Z","published":"2023-10-04T06:14:06Z","title":"MagicDrive: Street View Generation with Diverse 3D Geometry Control","summary":" Recent advancements in diffusion models have significantly enhanced data\nsynthesis with 2D control. Yet, precise 3D control in street view generation,\ncrucial for 3D perception tasks, remains elusive. Specifically, utilizing\nBird's-Eye View (BEV) as the primary condition often leads to challenges in\ngeometry control (e.g., height), affecting the representation of object shapes,\nocclusion patterns, and road surface elevations, all of which are essential to\nperception data synthesis, especially for 3D object detection tasks. In this\npaper, we introduce MagicDrive, a novel street view generation framework\noffering diverse 3D geometry controls, including camera poses, road maps, and\n3D bounding boxes, together with textual descriptions, achieved through\ntailored encoding strategies. Besides, our design incorporates a cross-view\nattention module, ensuring consistency across multiple camera views. 
With\nMagicDrive, we achieve high-fidelity street-view synthesis that captures\nnuanced 3D geometry and various scene descriptions, enhancing tasks like BEV\nsegmentation and 3D object detection.\n","authors":["Ruiyuan Gao","Kai Chen","Enze Xie","Lanqing Hong","Zhenguo Li","Dit-Yan Yeung","Qiang Xu"],"pdf_url":"https://arxiv.org/pdf/2310.02601v3.pdf","comment":"Project Page: https://flymin.github.io/magicdrive"},{"id":"http://arxiv.org/abs/2310.08854v1","updated":"2023-10-13T04:48:32Z","published":"2023-10-13T04:48:32Z","title":"Rank-DETR for High Quality Object Detection","summary":" Modern detection transformers (DETRs) use a set of object queries to predict\na list of bounding boxes, sort them by their classification confidence scores,\nand select the top-ranked predictions as the final detection results for the\ngiven input image. A highly performant object detector requires accurate\nranking for the bounding box predictions. For DETR-based detectors, the\ntop-ranked bounding boxes suffer from less accurate localization quality due to\nthe misalignment between classification scores and localization accuracy, thus\nimpeding the construction of high-quality detectors. In this work, we introduce\na simple and highly performant DETR-based object detector by proposing a series\nof rank-oriented designs, collectively called Rank-DETR. Our key contributions\ninclude: (i) a rank-oriented architecture design that can prompt positive\npredictions and suppress the negative ones to ensure lower false positive\nrates, as well as (ii) a rank-oriented loss function and matching cost design\nthat prioritizes predictions with more accurate localization during\nranking to boost the AP under high IoU thresholds. We apply our method to\nimprove the recent SOTA methods (e.g., H-DETR and DINO-DETR) and report strong\nCOCO object detection results when using different backbones such as\nResNet-50, Swin-T, and Swin-L, demonstrating the effectiveness of our\napproach. Code is available at https://github.com/LeapLabTHU/Rank-DETR.\n","authors":["Yifan Pu","Weicong Liang","Yiduo Hao","Yuhui Yuan","Yukang Yang","Chao Zhang","Han Hu","Gao Huang"],"pdf_url":"https://arxiv.org/pdf/2310.08854v1.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2310.04780v3","updated":"2023-10-13T04:23:26Z","published":"2023-10-07T11:45:33Z","title":"IPMix: Label-Preserving Data Augmentation Method for Training Robust\n Classifiers","summary":" Data augmentation has been proven effective for training high-accuracy\nconvolutional neural network classifiers by preventing overfitting. However,\nbuilding deep neural networks in real-world scenarios requires not only high\naccuracy on clean data but also robustness when data distributions shift. While\nprior methods have proposed that there is a trade-off between accuracy and\nrobustness, we propose IPMix, a simple data augmentation approach to improve\nrobustness without hurting clean accuracy. IPMix integrates three levels of\ndata augmentation (image-level, patch-level, and pixel-level) into a coherent\nand label-preserving technique to increase the diversity of training data with\nlimited computational overhead. To further improve the robustness, IPMix\nintroduces structural complexity at different levels to generate more diverse\nimages and adopts the random mixing method for multi-scale information fusion.\nExperiments demonstrate that IPMix outperforms state-of-the-art methods in corruption\nrobustness on CIFAR-C and ImageNet-C. 
In addition, we show that IPMix also\nsignificantly improves the other safety measures, including robustness to\nadversarial perturbations, calibration, prediction consistency, and anomaly\ndetection, achieving state-of-the-art or comparable results on several\nbenchmarks, including ImageNet-R, ImageNet-A, and ImageNet-O.\n","authors":["Zhenglin Huang","Xianan Bao","Na Zhang","Qingqi Zhang","Xiaomei Tu","Biao Wu","Xi Yang"],"pdf_url":"https://arxiv.org/pdf/2310.04780v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.12403v2","updated":"2023-10-13T03:48:11Z","published":"2022-06-24T17:59:02Z","title":"ZSON: Zero-Shot Object-Goal Navigation using Multimodal Goal Embeddings","summary":" We present a scalable approach for learning open-world object-goal navigation\n(ObjectNav) -- the task of asking a virtual robot (agent) to find any instance\nof an object in an unexplored environment (e.g., \"find a sink\"). Our approach\nis entirely zero-shot -- i.e., it does not require ObjectNav rewards or\ndemonstrations of any kind. Instead, we train on the image-goal navigation\n(ImageNav) task, in which agents find the location where a picture (i.e., goal\nimage) was captured. Specifically, we encode goal images into a multimodal,\nsemantic embedding space to enable training semantic-goal navigation\n(SemanticNav) agents at scale in unannotated 3D environments (e.g., HM3D).\nAfter training, SemanticNav agents can be instructed to find objects described\nin free-form natural language (e.g., \"sink\", \"bathroom sink\", etc.) by\nprojecting language goals into the same multimodal, semantic embedding space.\nAs a result, our approach enables open-world ObjectNav. We extensively evaluate\nour agents on three ObjectNav datasets (Gibson, HM3D, and MP3D) and observe\nabsolute improvements in success of 4.2% - 20.0% over existing zero-shot\nmethods. For reference, these gains are similar or better than the 5%\nimprovement in success between the Habitat 2020 and 2021 ObjectNav challenge\nwinners. In an open-world setting, we discover that our agents can generalize\nto compound instructions with a room explicitly mentioned (e.g., \"Find a\nkitchen sink\") and when the target room can be inferred (e.g., \"Find a sink and\na stove\").\n","authors":["Arjun Majumdar","Gunjan Aggarwal","Bhavika Devnani","Judy Hoffman","Dhruv Batra"],"pdf_url":"https://arxiv.org/pdf/2206.12403v2.pdf","comment":"code: https://github.com/gunagg/zson"},{"id":"http://arxiv.org/abs/2304.12152v2","updated":"2023-10-13T03:40:42Z","published":"2023-04-24T15:03:37Z","title":"Efficient Halftoning via Deep Reinforcement Learning","summary":" Halftoning aims to reproduce a continuous-tone image with pixels whose\nintensities are constrained to two discrete levels. This technique has been\ndeployed on every printer, and the majority of them adopt fast methods (e.g.,\nordered dithering, error diffusion) that fail to render structural details,\nwhich determine halftone's quality. Other prior methods of pursuing visual\npleasure by searching for the optimal halftone solution, on the contrary,\nsuffer from their high computational cost. In this paper, we propose a fast and\nstructure-aware halftoning method via a data-driven approach. Specifically, we\nformulate halftoning as a reinforcement learning problem, in which each binary\npixel's value is regarded as an action chosen by a virtual agent with a shared\nfully convolutional neural network (CNN) policy. 
In the offline phase, an\neffective gradient estimator is utilized to train the agents in producing\nhigh-quality halftones in one action step. Then, halftones can be generated\nonline by one fast CNN inference. Besides, we propose a novel anisotropy\nsuppressing loss function, which brings the desirable blue-noise property.\nFinally, we find that optimizing SSIM could result in holes in flat areas,\nwhich can be avoided by weighting the metric with the contone's contrast map.\nExperiments show that our framework can effectively train a light-weight CNN,\nwhich is 15x faster than previous structure-aware methods, to generate\nblue-noise halftones with satisfactory visual quality. We also present a\nprototype of deep multitoning to demonstrate the extensibility of our method.\n","authors":["Haitian Jiang","Dongliang Xiong","Xiaowen Jiang","Li Ding","Liang Chen","Kai Huang"],"pdf_url":"https://arxiv.org/pdf/2304.12152v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.03601v2","updated":"2023-10-13T03:25:34Z","published":"2023-07-07T13:43:44Z","title":"GPT4RoI: Instruction Tuning Large Language Model on Region-of-Interest","summary":" Visual instruction tuning large language model (LLM) on image-text pairs has\nachieved general-purpose vision-language abilities. However, the lack of\nregion-text pairs limits their advancements to fine-grained multimodal\nunderstanding. In this paper, we propose spatial instruction tuning, which\nintroduces the reference to the region-of-interest (RoI) in the instruction.\nBefore sending to LLM, the reference is replaced by RoI features and\ninterleaved with language embeddings as a sequence. Our model GPT4RoI, trained\non 7 region-text pair datasets, brings an unprecedented interactive and\nconversational experience compared to previous image-level models. (1)\nInteraction beyond language: Users can interact with our model by both language\nand drawing bounding boxes to flexibly adjust the referring granularity. (2)\nVersatile multimodal abilities: A variety of attribute information within each\nRoI can be mined by GPT4RoI, e.g., color, shape, material, action, etc.\nFurthermore, it can reason about multiple RoIs based on common sense. On the\nVisual Commonsense Reasoning (VCR) dataset, GPT4RoI achieves a remarkable\naccuracy of 81.6%, surpassing all existing models by a significant margin (the\nsecond place is 75.6%) and almost reaching human-level performance of 85.0%.\nThe code, dataset, and demo can be found at\nhttps://github.com/jshilong/GPT4RoI.\n","authors":["Shilong Zhang","Peize Sun","Shoufa Chen","Min Xiao","Wenqi Shao","Wenwei Zhang","Yu Liu","Kai Chen","Ping Luo"],"pdf_url":"https://arxiv.org/pdf/2307.03601v2.pdf","comment":"Code has been released at https://github.com/jshilong/GPT4RoI"},{"id":"http://arxiv.org/abs/2310.01404v2","updated":"2023-10-13T03:14:16Z","published":"2023-10-02T17:59:03Z","title":"H-InDex: Visual Reinforcement Learning with Hand-Informed\n Representations for Dexterous Manipulation","summary":" Human hands possess remarkable dexterity and have long served as a source of\ninspiration for robotic manipulation. In this work, we propose a human\nHand-Informed visual representation learning framework to\nsolve difficult Dexterous manipulation tasks (H-InDex)\nwith reinforcement learning. 
Our framework consists of three stages: (i)\npre-training representations with 3D human hand pose estimation, (ii) offline\nadapting representations with self-supervised keypoint detection, and (iii)\nreinforcement learning with exponential moving average BatchNorm. The last two\nstages only modify 0.36% of the pre-trained representation's parameters in\ntotal, ensuring the knowledge from pre-training is maintained to the full\nextent. We empirically study 12 challenging dexterous manipulation tasks and\nfind that H-InDex largely surpasses strong baseline methods and the recent\nvisual foundation models for motor control. Code is available at\nhttps://yanjieze.com/H-InDex.\n","authors":["Yanjie Ze","Yuyao Liu","Ruizhe Shi","Jiaxin Qin","Zhecheng Yuan","Jiashun Wang","Huazhe Xu"],"pdf_url":"https://arxiv.org/pdf/2310.01404v2.pdf","comment":"NeurIPS 2023. Code and videos: https://yanjieze.com/H-InDex"},{"id":"http://arxiv.org/abs/2303.05050v3","updated":"2023-10-13T02:44:40Z","published":"2023-03-09T05:54:42Z","title":"Lifelong-MonoDepth: Lifelong Learning for Multi-Domain Monocular Metric\n Depth Estimation","summary":" With the rapid advancements in autonomous driving and robot navigation, there\nis a growing demand for lifelong learning models capable of estimating metric\n(absolute) depth. Lifelong learning approaches potentially offer significant\ncost savings in terms of model training, data storage, and collection. However,\nthe quality of RGB images and depth maps is sensor-dependent, and depth maps in\nthe real world exhibit domain-specific characteristics, leading to variations\nin depth ranges. These challenges limit existing methods to lifelong learning\nscenarios with small domain gaps and relative depth map estimation. To\nfacilitate lifelong metric depth learning, we identify three crucial technical\nchallenges that require attention: i) developing a model capable of addressing\nthe depth scale variation through scale-aware depth learning, ii) devising an\neffective learning strategy to handle significant domain gaps, and iii)\ncreating an automated solution for domain-aware depth inference in practical\napplications. Based on the aforementioned considerations, in this paper, we\npresent i) a lightweight multi-head framework that effectively tackles the\ndepth scale imbalance, ii) an uncertainty-aware lifelong learning solution that\nadeptly handles significant domain gaps, and iii) an online domain-specific\npredictor selection method for real-time inference. Through extensive numerical\nstudies, we show that the proposed method can achieve good efficiency,\nstability, and plasticity, leading the benchmarks by 8% to 15%.\n","authors":["Junjie Hu","Chenyou Fan","Liguang Zhou","Qing Gao","Honghai Liu","Tin Lun Lam"],"pdf_url":"https://arxiv.org/pdf/2303.05050v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08826v1","updated":"2023-10-13T02:43:35Z","published":"2023-10-13T02:43:35Z","title":"Revisiting Multi-modal 3D Semantic Segmentation in Real-world Autonomous\n Driving","summary":" LiDAR and camera are two critical sensors for multi-modal 3D semantic\nsegmentation and are supposed to be fused efficiently and robustly to promise\nsafety in various real-world scenarios. However, existing multi-modal methods\nface two key challenges: 1) difficulty with efficient deployment and real-time\nexecution; and 2) drastic performance degradation under weak calibration\nbetween LiDAR and cameras. 
To address these challenges, we propose CPGNet-LCF,\na new multi-modal fusion framework extending the LiDAR-only CPGNet. CPGNet-LCF\nsolves the first challenge by inheriting the easy deployment and real-time\ncapabilities of CPGNet. For the second challenge, we introduce a novel weak\ncalibration knowledge distillation strategy during training to improve the\nrobustness against the weak calibration. CPGNet-LCF achieves state-of-the-art\nperformance on the nuScenes and SemanticKITTI benchmarks. Remarkably, it can be\neasily deployed to run in 20ms per frame on a single Tesla V100 GPU using\nTensorRT TF16 mode. Furthermore, we benchmark performance over four weak\ncalibration levels, demonstrating the robustness of our proposed approach.\n","authors":["Feng Jiang","Chaoping Tu","Gang Zhang","Jun Li","Hanqing Huang","Junyu Lin","Di Feng","Jian Pu"],"pdf_url":"https://arxiv.org/pdf/2310.08826v1.pdf","comment":"7 pages, 3 figures"},{"id":"http://arxiv.org/abs/2310.08825v1","updated":"2023-10-13T02:41:55Z","published":"2023-10-13T02:41:55Z","title":"From CLIP to DINO: Visual Encoders Shout in Multi-modal Large Language\n Models","summary":" Multi-modal Large Language Models (MLLMs) have made significant strides in\nexpanding the capabilities of Large Language Models (LLMs) through the\nincorporation of visual perception interfaces. Despite the emergence of\nexciting applications and the availability of diverse instruction tuning data,\nexisting approaches often rely on CLIP or its variants as the visual branch,\nand merely extract features from the deep layers. However, these methods lack a\ncomprehensive analysis of the visual encoders in MLLMs. In this paper, we\nconduct an extensive investigation into the effectiveness of different vision\nencoders within MLLMs. Our findings reveal that the shallow layer features of\nCLIP offer particular advantages for fine-grained tasks such as grounding and\nregion understanding. Surprisingly, the vision-only model DINO, which is not\npretrained with text-image alignment, demonstrates promising performance as a\nvisual branch within MLLMs. By simply equipping it with an MLP layer for\nalignment, DINO surpasses CLIP in fine-grained related perception tasks.\nBuilding upon these observations, we propose a simple yet effective feature\nmerging strategy, named COMM, that integrates CLIP and DINO with Multi-level\nfeatures Merging, to enhance the visual capabilities of MLLMs. We evaluate COMM\nthrough comprehensive experiments on a wide range of benchmarks, including\nimage captioning, visual question answering, visual grounding, and object\nhallucination. Experimental results demonstrate the superior performance of\nCOMM compared to existing methods, showcasing its enhanced visual capabilities\nwithin MLLMs. Code will be made available at\nhttps://github.com/YuchenLiu98/COMM.\n","authors":["Dongsheng Jiang","Yuchen Liu","Songlin Liu","Xiaopeng Zhang","Jin Li","Hongkai Xiong","Qi Tian"],"pdf_url":"https://arxiv.org/pdf/2310.08825v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12966v3","updated":"2023-10-13T02:41:28Z","published":"2023-08-24T17:59:17Z","title":"Qwen-VL: A Versatile Vision-Language Model for Understanding,\n Localization, Text Reading, and Beyond","summary":" In this work, we introduce the Qwen-VL series, a set of large-scale\nvision-language models (LVLMs) designed to perceive and understand both texts\nand images. 
Starting from the Qwen-LM as a foundation, we endow it with visual\ncapacity by the meticulously designed (i) visual receptor, (ii) input-output\ninterface, (iii) 3-stage training pipeline, and (iv) multilingual multimodal\ncleaned corpus. Beyond the conventional image description and\nquestion-answering, we implement the grounding and text-reading ability of\nQwen-VLs by aligning image-caption-box tuples. The resulting models, including\nQwen-VL and Qwen-VL-Chat, set new records for generalist models under similar\nmodel scales on a broad range of visual-centric benchmarks (e.g., image\ncaptioning, question answering, visual grounding) and different settings (e.g.,\nzero-shot, few-shot). Moreover, on real-world dialog benchmarks, our\ninstruction-tuned Qwen-VL-Chat also demonstrates superiority compared to\nexisting vision-language chatbots. Code, demo and models are available at\nhttps://github.com/QwenLM/Qwen-VL.\n","authors":["Jinze Bai","Shuai Bai","Shusheng Yang","Shijie Wang","Sinan Tan","Peng Wang","Junyang Lin","Chang Zhou","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.12966v3.pdf","comment":"Code, demo and models are available at\n https://github.com/QwenLM/Qwen-VL"},{"id":"http://arxiv.org/abs/2310.08820v1","updated":"2023-10-13T02:28:40Z","published":"2023-10-13T02:28:40Z","title":"SAM-guided Unsupervised Domain Adaptation for 3D Segmentation","summary":" Unsupervised domain adaptation (UDA) in 3D segmentation tasks presents a\nformidable challenge, primarily stemming from the sparse and unordered nature\nof point cloud data. Especially for LiDAR point clouds, the domain discrepancy\nbecomes obvious across varying capture scenes, fluctuating weather conditions,\nand the diverse array of LiDAR devices in use. While previous UDA methodologies\nhave often sought to mitigate this gap by aligning features between source and\ntarget domains, this approach falls short when applied to 3D segmentation due\nto the substantial domain variations. Inspired by the remarkable generalization\ncapabilities exhibited by the vision foundation model, SAM, in the realm of\nimage segmentation, our approach leverages the wealth of general knowledge\nembedded within SAM to unify feature representations across diverse 3D domains\nand further solves the 3D domain adaptation problem. Specifically, we harness\nthe corresponding images associated with point clouds to facilitate knowledge\ntransfer and propose an innovative hybrid feature augmentation methodology,\nwhich significantly enhances the alignment between the 3D feature space and\nSAM's feature space, operating at both the scene and instance levels. Our\nmethod is evaluated on many widely-recognized datasets and achieves\nstate-of-the-art performance.\n","authors":["Xidong Peng","Runnan Chen","Feng Qiao","Lingdong Kong","Youquan Liu","Tai Wang","Xinge Zhu","Yuexin Ma"],"pdf_url":"https://arxiv.org/pdf/2310.08820v1.pdf","comment":"submitted to ICLR 2024"},{"id":"http://arxiv.org/abs/2310.05873v3","updated":"2023-10-13T02:04:57Z","published":"2023-10-09T17:13:10Z","title":"Geom-Erasing: Geometry-Driven Removal of Implicit Concept in Diffusion\n Models","summary":" Fine-tuning diffusion models through personalized datasets is an acknowledged\nmethod for improving generation quality across downstream tasks, which,\nhowever, often inadvertently generates unintended concepts such as watermarks\nand QR codes, attributed to the limitations in image sources and collecting\nmethods within specific downstream tasks. 
Existing solutions suffer from\neliminating these unintentionally learned implicit concepts, primarily due to\nthe dependency on the model's ability to recognize concepts that it actually\ncannot discern. In this work, we introduce Geom-Erasing, a novel approach that\nsuccessfully removes the implicit concepts with either an additional accessible\nclassifier or detector model to encode geometric information of these concepts\ninto text domain. Moreover, we propose Implicit Concept, a novel image-text\ndataset imbued with three implicit concepts (i.e., watermarks, QR codes, and\ntext) for training and evaluation. Experimental results demonstrate that\nGeom-Erasing not only identifies but also proficiently eradicates implicit\nconcepts, revealing a significant improvement over the existing methods. The\nintegration of geometric information marks a substantial progression in the\nprecise removal of implicit concepts in diffusion models.\n","authors":["Zhili Liu","Kai Chen","Yifan Zhang","Jianhua Han","Lanqing Hong","Hang Xu","Zhenguo Li","Dit-Yan Yeung","James Kwok"],"pdf_url":"https://arxiv.org/pdf/2310.05873v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08815v1","updated":"2023-10-13T01:59:39Z","published":"2023-10-13T01:59:39Z","title":"Incremental Object Detection with CLIP","summary":" In the incremental detection task, unlike the incremental classification\ntask, data ambiguity exists due to the possibility of an image having different\nlabeled bounding boxes in multiple continuous learning stages. This phenomenon\noften impairs the model's ability to learn new classes. However, the forward\ncompatibility of the model is less considered in existing work, which hinders\nthe model's suitability for incremental learning. To overcome this obstacle, we\npropose to use a language-visual model such as CLIP to generate text feature\nembeddings for different class sets, which enhances the feature space globally.\nWe then employ the broad classes to replace the unavailable novel classes in\nthe early learning stage to simulate the actual incremental scenario. Finally,\nwe use the CLIP image encoder to identify potential objects in the proposals,\nwhich are classified into the background by the model. We modify the background\nlabels of those proposals to known classes and add the boxes to the training\nset to alleviate the problem of data ambiguity. We evaluate our approach on\nvarious incremental learning settings on the PASCAL VOC 2007 dataset, and our\napproach outperforms state-of-the-art methods, particularly for the new\nclasses.\n","authors":["Yupeng He","Ziyue Huang","Qingjie Liu","Yunhong Wang"],"pdf_url":"https://arxiv.org/pdf/2310.08815v1.pdf","comment":"10 pages, 2 figures"},{"id":"http://arxiv.org/abs/2305.02034v4","updated":"2023-10-13T01:49:42Z","published":"2023-05-03T10:58:07Z","title":"SAMRS: Scaling-up Remote Sensing Segmentation Dataset with Segment\n Anything Model","summary":" The success of the Segment Anything Model (SAM) demonstrates the significance\nof data-centric machine learning. However, due to the difficulties and high\ncosts associated with annotating Remote Sensing (RS) images, a large amount of\nvaluable RS data remains unlabeled, particularly at the pixel level. In this\nstudy, we leverage SAM and existing RS object detection datasets to develop an\nefficient pipeline for generating a large-scale RS segmentation dataset, dubbed\nSAMRS. 
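The incremental-detection entry above relies on CLIP text embeddings for different class sets to shape the feature space globally. A minimal sketch of that embedding step with the Hugging Face CLIP interface follows; the checkpoint name, prompt template, and class list are placeholders, and the full pipeline (broad classes, proposal relabeling) is not reproduced.

```python
import torch
from transformers import CLIPModel, CLIPProcessor

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Placeholder class set; in an incremental setting this grows per learning stage.
class_names = ["person", "car", "dog", "bicycle"]
prompts = [f"a photo of a {c}" for c in class_names]

with torch.no_grad():
    inputs = processor(text=prompts, return_tensors="pt", padding=True)
    text_emb = model.get_text_features(**inputs)            # (num_classes, 512)
    text_emb = text_emb / text_emb.norm(dim=-1, keepdim=True)

# A proposal's image feature (e.g. from the CLIP image encoder) can then be
# scored against every class set seen so far via cosine similarity.
proposal_feat = torch.randn(1, text_emb.shape[-1])
proposal_feat = proposal_feat / proposal_feat.norm(dim=-1, keepdim=True)
scores = proposal_feat @ text_emb.T
print(scores.shape)  # torch.Size([1, 4])
```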
SAMRS totally possesses 105,090 images and 1,668,241 instances,\nsurpassing existing high-resolution RS segmentation datasets in size by several\norders of magnitude. It provides object category, location, and instance\ninformation that can be used for semantic segmentation, instance segmentation,\nand object detection, either individually or in combination. We also provide a\ncomprehensive analysis of SAMRS from various aspects. Moreover, preliminary\nexperiments highlight the importance of conducting segmentation pre-training\nwith SAMRS to address task discrepancies and alleviate the limitations posed by\nlimited training data during fine-tuning. The code and dataset will be\navailable at https://github.com/ViTAE-Transformer/SAMRS.\n","authors":["Di Wang","Jing Zhang","Bo Du","Minqiang Xu","Lin Liu","Dacheng Tao","Liangpei Zhang"],"pdf_url":"https://arxiv.org/pdf/2305.02034v4.pdf","comment":"Accepted by NeurIPS 2023 Datasets and Benchmarks Track"},{"id":"http://arxiv.org/abs/2303.08320v4","updated":"2023-10-13T01:43:04Z","published":"2023-03-15T02:16:39Z","title":"VideoFusion: Decomposed Diffusion Models for High-Quality Video\n Generation","summary":" A diffusion probabilistic model (DPM), which constructs a forward diffusion\nprocess by gradually adding noise to data points and learns the reverse\ndenoising process to generate new samples, has been shown to handle complex\ndata distribution. Despite its recent success in image synthesis, applying DPMs\nto video generation is still challenging due to high-dimensional data spaces.\nPrevious methods usually adopt a standard diffusion process, where frames in\nthe same video clip are destroyed with independent noises, ignoring the content\nredundancy and temporal correlation. This work presents a decomposed diffusion\nprocess via resolving the per-frame noise into a base noise that is shared\namong all frames and a residual noise that varies along the time axis. The\ndenoising pipeline employs two jointly-learned networks to match the noise\ndecomposition accordingly. Experiments on various datasets confirm that our\napproach, termed as VideoFusion, surpasses both GAN-based and diffusion-based\nalternatives in high-quality video generation. We further show that our\ndecomposed formulation can benefit from pre-trained image diffusion models and\nwell-support text-conditioned video creation.\n","authors":["Zhengxiong Luo","Dayou Chen","Yingya Zhang","Yan Huang","Liang Wang","Yujun Shen","Deli Zhao","Jingren Zhou","Tieniu Tan"],"pdf_url":"https://arxiv.org/pdf/2303.08320v4.pdf","comment":"Accepted to CVPR2023"},{"id":"http://arxiv.org/abs/2310.08805v1","updated":"2023-10-13T01:27:36Z","published":"2023-10-13T01:27:36Z","title":"Two-Stage Deep Learning Framework for Quality Assessment of Left Atrial\n Late Gadolinium Enhanced MRI Images","summary":" Accurate assessment of left atrial fibrosis in patients with atrial\nfibrillation relies on high-quality 3D late gadolinium enhancement (LGE) MRI\nimages. However, obtaining such images is challenging due to patient motion,\nchanging breathing patterns, or sub-optimal choice of pulse sequence\nparameters. Automated assessment of LGE-MRI image diagnostic quality is\nclinically significant as it would enhance diagnostic accuracy, improve\nefficiency, ensure standardization, and contributes to better patient outcomes\nby providing reliable and high-quality LGE-MRI scans for fibrosis\nquantification and treatment planning. 
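The VideoFusion entry above decomposes per-frame noise into a base component shared by all frames plus a time-varying residual. A minimal sketch of that noise construction is below; the mixing weight `lambda_base` and the tensor shapes are illustrative assumptions, and the two jointly learned denoising networks are omitted.

```python
import torch

def decomposed_noise(batch, frames, channels, height, width, lambda_base=0.5):
    """Per-frame noise = shared base noise + time-varying residual noise.

    Scaling by sqrt(lambda) and sqrt(1 - lambda) keeps the mixture unit-variance
    when base and residual are independent standard Gaussians.
    """
    base = torch.randn(batch, 1, channels, height, width)            # shared across frames
    residual = torch.randn(batch, frames, channels, height, width)   # varies along time
    return (lambda_base ** 0.5) * base + ((1 - lambda_base) ** 0.5) * residual

noise = decomposed_noise(batch=2, frames=16, channels=3, height=32, width=32)
print(noise.shape)  # torch.Size([2, 16, 3, 32, 32])
```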
To address this, we propose a two-stage\ndeep-learning approach for automated LGE-MRI image diagnostic quality\nassessment. The method includes a left atrium detector to focus on relevant\nregions and a deep network to evaluate diagnostic quality. We explore two\ntraining strategies, multi-task learning, and pretraining using contrastive\nlearning, to overcome limited annotated data in medical imaging. Contrastive\nLearning result shows about $4\\%$, and $9\\%$ improvement in F1-Score and\nSpecificity compared to Multi-Task learning when there's limited data.\n","authors":["K M Arefeen Sultan","Benjamin Orkild","Alan Morris","Eugene Kholmovski","Erik Bieging","Eugene Kwan","Ravi Ranjan","Ed DiBella","Shireen Elhabian"],"pdf_url":"https://arxiv.org/pdf/2310.08805v1.pdf","comment":"Accepted to STACOM 2023. 11 pages, 3 figures"},{"id":"http://arxiv.org/abs/2310.08475v2","updated":"2023-10-13T01:12:25Z","published":"2023-10-12T16:32:44Z","title":"Can We Edit Multimodal Large Language Models?","summary":" In this paper, we focus on editing Multimodal Large Language Models (MLLMs).\nCompared to editing single-modal LLMs, multimodal model editing is more\nchallenging, which demands a higher level of scrutiny and careful consideration\nin the editing process. To facilitate research in this area, we construct a new\nbenchmark, dubbed MMEdit, for editing multimodal LLMs and establishing a suite\nof innovative metrics for evaluation. We conduct comprehensive experiments\ninvolving various model editing baselines and analyze the impact of editing\ndifferent components for multimodal LLMs. Empirically, we notice that previous\nbaselines can implement editing multimodal LLMs to some extent, but the effect\nis still barely satisfactory, indicating the potential difficulty of this task.\nWe hope that our work can provide the NLP community with insights. Code and\ndataset are available in https://github.com/zjunlp/EasyEdit.\n","authors":["Siyuan Cheng","Bozhong Tian","Qingbin Liu","Xi Chen","Yongheng Wang","Huajun Chen","Ningyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.08475v2.pdf","comment":"EMNLP 2023"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2303.00807v3","updated":"2023-10-13T17:23:04Z","published":"2023-03-01T20:21:23Z","title":"UDAPDR: Unsupervised Domain Adaptation via LLM Prompting and\n Distillation of Rerankers","summary":" Many information retrieval tasks require large labeled datasets for\nfine-tuning. However, such datasets are often unavailable, and their utility\nfor real-world applications can diminish quickly due to domain shifts. To\naddress this challenge, we develop and motivate a method for using large\nlanguage models (LLMs) to generate large numbers of synthetic queries cheaply.\nThe method begins by generating a small number of synthetic queries using an\nexpensive LLM. After that, a much less expensive one is used to create large\nnumbers of synthetic queries, which are used to fine-tune a family of reranker\nmodels. These rerankers are then distilled into a single efficient retriever\nfor use in the target domain. 
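The LGE-MRI quality-assessment entry above compares multi-task learning against contrastive pretraining when labels are scarce. Below is a minimal SimCLR-style NT-Xent loss, the kind of objective such contrastive pretraining commonly uses; it is a generic illustration under assumed batch size and temperature, not the authors' exact setup.

```python
import torch
import torch.nn.functional as F

def nt_xent(z1, z2, temperature=0.5):
    """Contrastive loss for two augmented views z1, z2 of shape (N, D)."""
    n = z1.shape[0]
    z = F.normalize(torch.cat([z1, z2], dim=0), dim=1)   # (2N, D)
    sim = z @ z.T / temperature                          # scaled cosine similarities
    sim.fill_diagonal_(float("-inf"))                    # exclude self-pairs
    # The positive for row i is its other view: i+N (first half) or i-N (second half).
    targets = torch.cat([torch.arange(n) + n, torch.arange(n)])
    return F.cross_entropy(sim, targets)

# Toy usage with random stand-ins for embeddings of two augmented views.
z1, z2 = torch.randn(8, 128), torch.randn(8, 128)
print(nt_xent(z1, z2).item())
```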
We show that this technique boosts zero-shot\naccuracy in long-tail domains and achieves substantially lower latency than\nstandard reranking methods.\n","authors":["Jon Saad-Falcon","Omar Khattab","Keshav Santhanam","Radu Florian","Martin Franz","Salim Roukos","Avirup Sil","Md Arafat Sultan","Christopher Potts"],"pdf_url":"https://arxiv.org/pdf/2303.00807v3.pdf","comment":"Long Paper at Empirical Methods in Natural Language Processing\n (EMNLP) 2023"},{"id":"http://arxiv.org/abs/2310.09234v1","updated":"2023-10-13T16:37:53Z","published":"2023-10-13T16:37:53Z","title":"ClickPrompt: CTR Models are Strong Prompt Generators for Adapting\n Language Models to CTR Prediction","summary":" Click-through rate (CTR) prediction has become increasingly indispensable for\nvarious Internet applications. Traditional CTR models convert the multi-field\ncategorical data into ID features via one-hot encoding, and extract the\ncollaborative signals among features. Such a paradigm suffers from the problem\nof semantic information loss. Another line of research explores the potential\nof pretrained language models (PLMs) for CTR prediction by converting input\ndata into textual sentences through hard prompt templates. Although semantic\nsignals are preserved, they generally fail to capture the collaborative\ninformation (e.g., feature interactions, pure ID features), not to mention the\nunacceptable inference overhead brought by the huge model size. In this paper,\nwe aim to model both the semantic knowledge and collaborative knowledge for\naccurate CTR estimation, and meanwhile address the inference inefficiency\nissue. To benefit from both worlds and close their gaps, we propose a novel\nmodel-agnostic framework (i.e., ClickPrompt), where we incorporate CTR models\nto generate interaction-aware soft prompts for PLMs. We design a\nprompt-augmented masked language modeling (PA-MLM) pretraining task, where PLM\nhas to recover the masked tokens based on the language context, as well as the\nsoft prompts generated by CTR model. The collaborative and semantic knowledge\nfrom ID and textual features would be explicitly aligned and interacted via the\nprompt interface. Then, we can either tune the CTR model with PLM for superior\nperformance, or solely tune the CTR model without PLM for inference efficiency.\nExperiments on four real-world datasets validate the effectiveness of\nClickPrompt compared with existing baselines.\n","authors":["Jianghao Lin","Bo Chen","Hangyu Wang","Yunjia Xi","Yanru Qu","Xinyi Dai","Kangning Zhang","Ruiming Tang","Yong Yu","Weinan Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.09234v1.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2310.09233v1","updated":"2023-10-13T16:37:14Z","published":"2023-10-13T16:37:14Z","title":"AgentCF: Collaborative Learning with Autonomous Language Agents for\n Recommender Systems","summary":" Recently, there has been an emergence of employing LLM-powered agents as\nbelievable human proxies, based on their remarkable decision-making capability.\nHowever, existing studies mainly focus on simulating human dialogue. Human\nnon-verbal behaviors, such as item clicking in recommender systems, although\nimplicitly exhibiting user preferences and could enhance the modeling of users,\nhave not been deeply explored. 
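The ClickPrompt entry above has a CTR model generate interaction-aware soft prompts that are prepended to the PLM's token embeddings. The sketch below shows one plausible form of that prompt interface; the dimensions, the single linear projection, and the random stand-ins for the CTR model and PLM embeddings are assumptions for illustration only.

```python
import torch
import torch.nn as nn

class SoftPromptBridge(nn.Module):
    """Map a CTR model's representation to k soft-prompt vectors that are
    prepended to the language model's token embeddings."""

    def __init__(self, ctr_dim=64, plm_dim=768, num_prompts=4):
        super().__init__()
        self.num_prompts = num_prompts
        self.proj = nn.Linear(ctr_dim, num_prompts * plm_dim)

    def forward(self, ctr_repr, token_embeds):
        # ctr_repr: (B, ctr_dim); token_embeds: (B, T, plm_dim)
        b = ctr_repr.shape[0]
        prompts = self.proj(ctr_repr).view(b, self.num_prompts, -1)
        return torch.cat([prompts, token_embeds], dim=1)   # (B, k+T, plm_dim)

bridge = SoftPromptBridge()
ctr_repr = torch.randn(2, 64)            # stand-in for the CTR model output
token_embeds = torch.randn(2, 16, 768)   # stand-in for PLM input embeddings
print(bridge(ctr_repr, token_embeds).shape)  # torch.Size([2, 20, 768])
```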
The main reasons lie in the gap between language\nmodeling and behavior modeling, as well as the incomprehension of LLMs about\nuser-item relations.\n To address this issue, we propose AgentCF for simulating user-item\ninteractions in recommender systems through agent-based collaborative\nfiltering. We creatively consider not only users but also items as agents, and\ndevelop a collaborative learning approach that optimizes both kinds of agents\ntogether. Specifically, at each time step, we first prompt the user and item\nagents to interact autonomously. Then, based on the disparities between the\nagents' decisions and real-world interaction records, user and item agents are\nprompted to reflect on and adjust the misleading simulations collaboratively,\nthereby modeling their two-sided relations. The optimized agents can also\npropagate their preferences to other agents in subsequent interactions,\nimplicitly capturing the collaborative filtering idea. Overall, the optimized\nagents exhibit diverse interaction behaviors within our framework, including\nuser-item, user-user, item-item, and collective interactions. The results show\nthat these agents can demonstrate personalized behaviors akin to those of\nreal-world individuals, sparking the development of next-generation user\nbehavior simulation.\n","authors":["Junjie Zhang","Yupeng Hou","Ruobing Xie","Wenqi Sun","Julian McAuley","Wayne Xin Zhao","Leyu Lin","Ji-Rong Wen"],"pdf_url":"https://arxiv.org/pdf/2310.09233v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11131v2","updated":"2023-10-13T16:13:52Z","published":"2023-08-22T02:25:04Z","title":"ReLLa: Retrieval-enhanced Large Language Models for Lifelong Sequential\n Behavior Comprehension in Recommendation","summary":" With large language models (LLMs) achieving remarkable breakthroughs in\nnatural language processing (NLP) domains, LLM-enhanced recommender systems\nhave received much attention and have been actively explored currently. In this\npaper, we focus on adapting and empowering a pure large language model for\nzero-shot and few-shot recommendation tasks. First and foremost, we identify\nand formulate the lifelong sequential behavior incomprehension problem for LLMs\nin recommendation domains, i.e., LLMs fail to extract useful information from a\ntextual context of long user behavior sequence, even if the length of context\nis far from reaching the context limitation of LLMs. To address such an issue\nand improve the recommendation performance of LLMs, we propose a novel\nframework, namely Retrieval-enhanced Large Language models (ReLLa) for\nrecommendation tasks in both zero-shot and few-shot settings. For zero-shot\nrecommendation, we perform semantic user behavior retrieval (SUBR) to improve\nthe data quality of testing samples, which greatly reduces the difficulty for\nLLMs to extract the essential knowledge from user behavior sequences. As for\nfew-shot recommendation, we further design retrieval-enhanced instruction\ntuning (ReiT) by adopting SUBR as a data augmentation technique for training\nsamples. Specifically, we develop a mixed training dataset consisting of both\nthe original data samples and their retrieval-enhanced counterparts. We conduct\nextensive experiments on three real-world public datasets to demonstrate the\nsuperiority of ReLLa compared with existing baseline models, as well as its\ncapability for lifelong sequential behavior comprehension. 
To be highlighted,\nwith only less than 10% training samples, few-shot ReLLa can outperform\ntraditional CTR models that are trained on the entire training set (e.g.,\nDCNv2, DIN, SIM).\n","authors":["Jianghao Lin","Rong Shan","Chenxu Zhu","Kounianhua Du","Bo Chen","Shigang Quan","Ruiming Tang","Yong Yu","Weinan Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.11131v2.pdf","comment":"Updated Version. Few-shot ReLLa is now able to outperform full-shot\n CTR models trained on the entire training set"},{"id":"http://arxiv.org/abs/2310.08891v1","updated":"2023-10-13T06:53:02Z","published":"2023-10-13T06:53:02Z","title":"EHI: End-to-end Learning of Hierarchical Index for Efficient Dense\n Retrieval","summary":" Dense embedding-based retrieval is now the industry standard for semantic\nsearch and ranking problems, like obtaining relevant web documents for a given\nquery. Such techniques use a two-stage process: (a) contrastive learning to\ntrain a dual encoder to embed both the query and documents and (b) approximate\nnearest neighbor search (ANNS) for finding similar documents for a given query.\nThese two stages are disjoint; the learned embeddings might be ill-suited for\nthe ANNS method and vice-versa, leading to suboptimal performance. In this\nwork, we propose End-to-end Hierarchical Indexing -- EHI -- that jointly learns\nboth the embeddings and the ANNS structure to optimize retrieval performance.\nEHI uses a standard dual encoder model for embedding queries and documents\nwhile learning an inverted file index (IVF) style tree structure for efficient\nANNS. To ensure stable and efficient learning of discrete tree-based ANNS\nstructure, EHI introduces the notion of dense path embedding that captures the\nposition of a query/document in the tree. We demonstrate the effectiveness of\nEHI on several benchmarks, including de-facto industry standard MS MARCO (Dev\nset and TREC DL19) datasets. For example, with the same compute budget, EHI\noutperforms state-of-the-art (SOTA) in by 0.6% (MRR@10) on MS MARCO dev set and\nby 4.2% (nDCG@10) on TREC DL19 benchmarks.\n","authors":["Ramnath Kumar","Anshul Mittal","Nilesh Gupta","Aditya Kusupati","Inderjit Dhillon","Prateek Jain"],"pdf_url":"https://arxiv.org/pdf/2310.08891v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09401v1","updated":"2023-10-13T20:53:50Z","published":"2023-10-13T20:53:50Z","title":"CIDER: Category-Guided Intent Disentanglement for Accurate Personalized\n News Recommendation","summary":" Personalized news recommendation aims to assist users in finding news\narticles that align with their interests, which plays a pivotal role in\nmitigating users' information overload problem. Although many recent works have\nbeen studied for better user and news representations, the following challenges\nhave been rarely studied: (C1) How to precisely comprehend a range of intents\ncoupled within a news article? and (C2) How to differentiate news articles with\nvarying post-read preferences in users' click history? To tackle both\nchallenges together, in this paper, we propose a novel personalized news\nrecommendation framework (CIDER) that employs (1) category-guided intent\ndisentanglement for (C1) and (2) consistency-based news representation for\n(C2). Furthermore, we incorporate a category prediction into the training\nprocess of CIDER as an auxiliary task, which provides supplementary supervisory\nsignals to enhance intent disentanglement. 
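The EHI entry above contrasts its end-to-end approach with the standard disjoint pipeline: a dual encoder produces embeddings, and an IVF-style ANNS index is built separately. The sketch below shows that conventional two-stage baseline with FAISS, not EHI itself; the embedding dimension, list count, probe setting, and random vectors are placeholders.

```python
import numpy as np
import faiss

d, n_docs, n_lists = 128, 10000, 64

# Stage (a): embeddings from a dual encoder (random stand-ins here).
doc_emb = np.random.randn(n_docs, d).astype("float32")
query_emb = np.random.randn(5, d).astype("float32")
faiss.normalize_L2(doc_emb)
faiss.normalize_L2(query_emb)

# Stage (b): an IVF index trained separately from the encoder.
quantizer = faiss.IndexFlatIP(d)
index = faiss.IndexIVFFlat(quantizer, d, n_lists, faiss.METRIC_INNER_PRODUCT)
index.train(doc_emb)
index.add(doc_emb)
index.nprobe = 8                     # clusters probed per query

scores, ids = index.search(query_emb, 10)
print(ids.shape)  # (5, 10)
```

The disjointness is visible here: nothing in stage (b) feeds back into the encoder of stage (a), which is the gap the abstract's joint training is meant to close.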
Extensive experiments on two\nreal-world datasets reveal that (1) CIDER provides consistent performance\nimprovements over seven state-of-the-art news recommendation methods and (2)\nthe proposed strategies significantly improve the model accuracy of CIDER.\n","authors":["Yunyong Ko","Seongeun Ryu","Sang-Wook Kim"],"pdf_url":"https://arxiv.org/pdf/2310.09401v1.pdf","comment":"8 pages, 6 figures, 6 tables"},{"id":"http://arxiv.org/abs/2310.09400v1","updated":"2023-10-13T20:52:18Z","published":"2023-10-13T20:52:18Z","title":"Collaborative Contextualization: Bridging the Gap between Collaborative\n Filtering and Pre-trained Language Model","summary":" Traditional recommender systems have heavily relied on identity\nrepresentations (IDs) to model users and items, while the ascendancy of\npre-trained language model (PLM) encoders has enriched the modeling of\ncontextual item descriptions. However, PLMs, although effective in addressing\nfew-shot, zero-shot, or unified modeling scenarios, often neglect the crucial\ncollaborative filtering signal. This neglect gives rise to two pressing\nchallenges: (1) Collaborative Contextualization, the seamless integration of\ncollaborative signals with contextual representations. (2) the imperative to\nbridge the representation gap between ID-based representations and contextual\nrepresentations while preserving their contextual semantics. In this paper, we\npropose CollabContext, a novel model that adeptly combines collaborative\nfiltering signals with contextual representations and aligns these\nrepresentations within the contextual space, preserving essential contextual\nsemantics. Experimental results across three real-world datasets demonstrate\nsubstantial improvements. Leveraging collaborative contextualization,\nCollabContext can also be effectively applied to cold-start scenarios,\nachieving remarkable enhancements in recommendation performance. The code is\navailable after the conference accepts the paper.\n","authors":["Chen Wang","Liangwei Yang","Zhiwei Liu","Xiaolong Liu","Mingdai Yang","Yueqing Liang","Philip S. Yu"],"pdf_url":"https://arxiv.org/pdf/2310.09400v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09341v1","updated":"2023-10-13T18:11:12Z","published":"2023-10-13T18:11:12Z","title":"Addressing the cold start problem in privacy preserving content-based\n recommender systems using hypercube graphs","summary":" The initial interaction of a user with a recommender system is problematic\nbecause, in such a so-called cold start situation, the recommender system has\nvery little information about the user, if any. Moreover, in collaborative\nfiltering, users need to share their preferences with the service provider by\nrating items while in content-based filtering there is no need for such\ninformation sharing. We have recently shown that a content-based model that\nuses hypercube graphs can determine user preferences with a very limited number\nof ratings while better preserving user privacy. In this paper, we confirm\nthese findings on the basis of experiments with more than 1,000 users in the\nrestaurant and movie domains. We show that the proposed method outperforms\nstandard machine learning algorithms when the number of available ratings is at\nmost 10, which often happens, and is competitive with larger training sets. 
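The CollabContext entry above combines collaborative-filtering ID embeddings with PLM contextual item representations and aligns them in the contextual space. Below is a minimal sketch of one plausible alignment step: project the ID embedding and penalize the cosine gap to a frozen contextual encoding. The projection and loss choice are assumptions, not the paper's exact objective.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class IDToContext(nn.Module):
    """Project collaborative ID embeddings into the PLM's contextual space."""

    def __init__(self, num_items=1000, id_dim=64, ctx_dim=768):
        super().__init__()
        self.id_emb = nn.Embedding(num_items, id_dim)
        self.proj = nn.Linear(id_dim, ctx_dim)

    def forward(self, item_ids):
        return self.proj(self.id_emb(item_ids))

def alignment_loss(projected_id, ctx_repr):
    # Encourage the projected ID embedding to match the contextual one.
    return 1.0 - F.cosine_similarity(projected_id, ctx_repr, dim=-1).mean()

model = IDToContext()
item_ids = torch.randint(0, 1000, (8,))
ctx_repr = torch.randn(8, 768)           # stand-in for PLM item encodings
print(alignment_loss(model(item_ids), ctx_repr).item())
```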
In\naddition, training is simple and does not require large computational efforts.\n","authors":["Noa Tuval","Alain Hertz","Tsvi Kuflik"],"pdf_url":"https://arxiv.org/pdf/2310.09341v1.pdf","comment":"22 pages, 6 figures, 9 tables"},{"id":"http://arxiv.org/abs/2309.04761v2","updated":"2023-10-13T11:18:13Z","published":"2023-09-09T11:20:40Z","title":"A Comprehensive Survey on Deep Learning Techniques in Educational Data\n Mining","summary":" Educational Data Mining (EDM) has emerged as a vital field of research, which\nharnesses the power of computational techniques to analyze educational data.\nWith the increasing complexity and diversity of educational data, Deep Learning\ntechniques have shown significant advantages in addressing the challenges\nassociated with analyzing and modeling this data. This survey aims to\nsystematically review the state-of-the-art in EDM with Deep Learning. We begin\nby providing a brief introduction to EDM and Deep Learning, highlighting their\nrelevance in the context of modern education. Next, we present a detailed\nreview of Deep Learning techniques applied in four typical educational\nscenarios, including knowledge tracing, undesirable student detecting,\nperformance prediction, and personalized recommendation. Furthermore, a\ncomprehensive overview of public datasets and processing tools for EDM is\nprovided. Finally, we point out emerging trends and future directions in this\nresearch area.\n","authors":["Yuanguo Lin","Hong Chen","Wei Xia","Fan Lin","Pengcheng Wu","Zongyue Wang","Yong Liu"],"pdf_url":"https://arxiv.org/pdf/2309.04761v2.pdf","comment":"21 pages, 5 figures"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2306.07261v4","updated":"2023-10-13T17:53:38Z","published":"2023-06-12T17:44:15Z","title":"Unprocessing Seven Years of Algorithmic Fairness","summary":" Seven years ago, researchers proposed a postprocessing method to equalize the\nerror rates of a model across different demographic groups. The work launched\nhundreds of papers purporting to improve over the postprocessing baseline. We\nempirically evaluate these claims through thousands of model evaluations on\nseveral tabular datasets. We find that the fairness-accuracy Pareto frontier\nachieved by postprocessing contains all other methods we were feasibly able to\nevaluate. In doing so, we address two common methodological errors that have\nconfounded previous observations. One relates to the comparison of methods with\ndifferent unconstrained base models. The other concerns methods achieving\ndifferent levels of constraint relaxation. At the heart of our study is a\nsimple idea we call unprocessing that roughly corresponds to the inverse of\npostprocessing. Unprocessing allows for a direct comparison of methods using\ndifferent underlying models and levels of relaxation.\n","authors":["André F. Cruz","Moritz Hardt"],"pdf_url":"https://arxiv.org/pdf/2306.07261v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09278v1","updated":"2023-10-13T17:40:39Z","published":"2023-10-13T17:40:39Z","title":"Disentangled Latent Spaces Facilitate Data-Driven Auxiliary Learning","summary":" In deep learning, auxiliary objectives are often used to facilitate learning\nin situations where data is scarce, or the principal task is extremely complex.\nThis idea is primarily inspired by the improved generalization capability\ninduced by solving multiple tasks simultaneously, which leads to a more robust\nshared representation. 
Nevertheless, finding optimal auxiliary tasks that give\nrise to the desired improvement is a crucial problem that often requires\nhand-crafted solutions or expensive meta-learning approaches. In this paper, we\npropose a novel framework, dubbed Detaux, whereby a weakly supervised\ndisentanglement procedure is used to discover new unrelated classification\ntasks and the associated labels that can be exploited with the principal task\nin any Multi-Task Learning (MTL) model. The disentanglement procedure works at\na representation level, isolating a subspace related to the principal task,\nplus an arbitrary number of orthogonal subspaces. In the most disentangled\nsubspaces, through a clustering procedure, we generate the additional\nclassification tasks, and the associated labels become their representatives.\nSubsequently, the original data, the labels associated with the principal task,\nand the newly discovered ones can be fed into any MTL framework. Extensive\nvalidation on both synthetic and real data, along with various ablation\nstudies, demonstrate promising results, revealing the potential in what has\nbeen, so far, an unexplored connection between learning disentangled\nrepresentations and MTL. The code will be made publicly available upon\nacceptance.\n","authors":["Geri Skenderi","Luigi Capogrosso","Andrea Toaiari","Matteo Denitto","Franco Fummi","Simone Melzi","Marco Cristani"],"pdf_url":"https://arxiv.org/pdf/2310.09278v1.pdf","comment":"Under review in Pattern Recognition Letters"},{"id":"http://arxiv.org/abs/2310.09277v1","updated":"2023-10-13T17:39:35Z","published":"2023-10-13T17:39:35Z","title":"A Hybrid Approach for Depression Classification: Random Forest-ANN\n Ensemble on Motor Activity Signals","summary":" Regarding the rising number of people suffering from mental health illnesses\nin today's society, the importance of mental health cannot be overstated.\nWearable sensors, which are increasingly widely available, provide a potential\nway to track and comprehend mental health issues. These gadgets not only\nmonitor everyday activities but also continuously record vital signs like heart\nrate, perhaps providing information on a person's mental state. Recent research\nhas used these sensors in conjunction with machine learning methods to identify\npatterns relating to different mental health conditions, highlighting the\nimmense potential of this data beyond simple activity monitoring. In this\nresearch, we present a novel algorithm called the Hybrid Random forest - Neural\nnetwork that has been tailored to evaluate sensor data from depressed patients.\nOur method has a noteworthy accuracy of 80\\% when evaluated on a special\ndataset that included both unipolar and bipolar depressive patients as well as\nhealthy controls. The findings highlight the algorithm's potential for reliably\ndetermining a person's depression condition using sensor data, making a\nsubstantial contribution to the area of mental health diagnostics.\n","authors":["Anket Patil","Dhairya Shah","Abhishek Shah","Mokshit Gala"],"pdf_url":"https://arxiv.org/pdf/2310.09277v1.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2310.09270v1","updated":"2023-10-13T17:35:04Z","published":"2023-10-13T17:35:04Z","title":"Retro-fallback: retrosynthetic planning in an uncertain world","summary":" Retrosynthesis is the task of proposing a series of chemical reactions to\ncreate a desired molecule from simpler, buyable molecules. 
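The Detaux entry above generates auxiliary classification tasks by clustering samples inside disentangled subspaces orthogonal to the principal task. The sketch below shows only that labeling step with k-means on an assumed subspace split; the disentanglement procedure itself, the dimensions, and the cluster count are illustrative assumptions.

```python
import numpy as np
from sklearn.cluster import KMeans

rng = np.random.default_rng(0)
latent = rng.normal(size=(500, 16))     # stand-in for learned representations

# Assume the last 8 dimensions form a subspace orthogonal to the principal task.
aux_subspace = latent[:, 8:]

# Cluster in that subspace; cluster indices become auxiliary labels that can be
# fed to any multi-task learning framework alongside the principal-task labels.
aux_labels = KMeans(n_clusters=4, n_init=10, random_state=0).fit_predict(aux_subspace)
print(np.bincount(aux_labels))
```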
While previous works\nhave proposed algorithms to find optimal solutions for a range of metrics (e.g.\nshortest, lowest-cost), these works generally overlook the fact that we have\nimperfect knowledge of the space of possible reactions, meaning plans created\nby the algorithm may not work in a laboratory. In this paper we propose a novel\nformulation of retrosynthesis in terms of stochastic processes to account for\nthis uncertainty. We then propose a novel greedy algorithm called\nretro-fallback which maximizes the probability that at least one synthesis plan\ncan be executed in the lab. Using in-silico benchmarks we demonstrate that\nretro-fallback generally produces better sets of synthesis plans than the\npopular MCTS and retro* algorithms.\n","authors":["Austin Tripp","Krzysztof Maziarz","Sarah Lewis","Marwin Segler","José Miguel Hernández-Lobato"],"pdf_url":"https://arxiv.org/pdf/2310.09270v1.pdf","comment":"39 pages (including appendices). Currently undergoing peer review"},{"id":"http://arxiv.org/abs/2310.09267v1","updated":"2023-10-13T17:25:11Z","published":"2023-10-13T17:25:11Z","title":"Genetic algorithms are strong baselines for molecule generation","summary":" Generating molecules, both in a directed and undirected fashion, is a huge\npart of the drug discovery pipeline. Genetic algorithms (GAs) generate\nmolecules by randomly modifying known molecules. In this paper we show that GAs\nare very strong algorithms for such tasks, outperforming many complicated\nmachine learning methods: a result which many researchers may find surprising.\nWe therefore propose insisting during peer review that new algorithms must have\nsome clear advantage over GAs, which we call the GA criterion. Ultimately our\nwork suggests that a lot of research in molecule generation should be\nre-assessed.\n","authors":["Austin Tripp","José Miguel Hernández-Lobato"],"pdf_url":"https://arxiv.org/pdf/2310.09267v1.pdf","comment":"Currently under review. Code will be made available at a later date"},{"id":"http://arxiv.org/abs/2310.09266v1","updated":"2023-10-13T17:24:52Z","published":"2023-10-13T17:24:52Z","title":"User Inference Attacks on Large Language Models","summary":" Fine-tuning is a common and effective method for tailoring large language\nmodels (LLMs) to specialized tasks and applications. In this paper, we study\nthe privacy implications of fine-tuning LLMs on user data. To this end, we\ndefine a realistic threat model, called user inference, wherein an attacker\ninfers whether or not a user's data was used for fine-tuning. We implement\nattacks for this threat model that require only a small set of samples from a\nuser (possibly different from the samples used for training) and black-box\naccess to the fine-tuned LLM. We find that LLMs are susceptible to user\ninference attacks across a variety of fine-tuning datasets, at times with near\nperfect attack success rates. Further, we investigate which properties make\nusers vulnerable to user inference, finding that outlier users (i.e. those with\ndata distributions sufficiently different from other users) and users who\ncontribute large quantities of data are most susceptible to attack. Finally, we\nexplore several heuristics for mitigating privacy attacks. We find that\ninterventions in the training algorithm, such as batch or per-example gradient\nclipping and early stopping fail to prevent user inference. 
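The retro-fallback entry above greedily builds a set of synthesis plans to maximize the probability that at least one can be executed. The toy sketch below conveys only that objective under a strong simplifying assumption of independent per-plan success probabilities (the paper instead models feasibility with stochastic processes); the plan names and probabilities are made up.

```python
def prob_at_least_one(probs):
    """P(at least one plan succeeds), assuming independent plans."""
    p_all_fail = 1.0
    for p in probs:
        p_all_fail *= (1.0 - p)
    return 1.0 - p_all_fail

candidate_plans = {"plan_a": 0.6, "plan_b": 0.5, "plan_c": 0.45, "plan_d": 0.2}

def greedy_select(plans, budget=2):
    """Greedily add the plan that most increases the success probability."""
    chosen = []
    for _ in range(budget):
        best = max((p for p in plans if p not in chosen),
                   key=lambda p: prob_at_least_one([plans[q] for q in chosen + [p]]))
        chosen.append(best)
    return chosen

picked = greedy_select(candidate_plans, budget=2)
print(picked, prob_at_least_one([candidate_plans[p] for p in picked]))
```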
However, limiting\nthe number of fine-tuning samples from a single user can reduce attack\neffectiveness, albeit at the cost of reducing the total amount of fine-tuning\ndata.\n","authors":["Nikhil Kandpal","Krishna Pillutla","Alina Oprea","Peter Kairouz","Christopher A. Choquette-Choo","Zheng Xu"],"pdf_url":"https://arxiv.org/pdf/2310.09266v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09265v1","updated":"2023-10-13T17:23:17Z","published":"2023-10-13T17:23:17Z","title":"PromptRE: Weakly-Supervised Document-Level Relation Extraction via\n Prompting-Based Data Programming","summary":" Relation extraction aims to classify the relationships between two entities\ninto pre-defined categories. While previous research has mainly focused on\nsentence-level relation extraction, recent studies have expanded the scope to\ndocument-level relation extraction. Traditional relation extraction methods\nheavily rely on human-annotated training data, which is time-consuming and\nlabor-intensive. To mitigate the need for manual annotation, recent\nweakly-supervised approaches have been developed for sentence-level relation\nextraction while limited work has been done on document-level relation\nextraction. Weakly-supervised document-level relation extraction faces\nsignificant challenges due to an imbalanced number \"no relation\" instances and\nthe failure of directly probing pretrained large language models for document\nrelation extraction. To address these challenges, we propose PromptRE, a novel\nweakly-supervised document-level relation extraction method that combines\nprompting-based techniques with data programming. Furthermore, PromptRE\nincorporates the label distribution and entity types as prior knowledge to\nimprove the performance. By leveraging the strengths of both prompting and data\nprogramming, PromptRE achieves improved performance in relation classification\nand effectively handles the \"no relation\" problem. Experimental results on\nReDocRED, a benchmark dataset for document-level relation extraction,\ndemonstrate the superiority of PromptRE over baseline approaches.\n","authors":["Chufan Gao","Xulin Fan","Jimeng Sun","Xuan Wang"],"pdf_url":"https://arxiv.org/pdf/2310.09265v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.12127v3","updated":"2023-10-13T17:21:46Z","published":"2022-09-25T02:56:01Z","title":"SpeedLimit: Neural Architecture Search for Quantized Transformer Models","summary":" While research in the field of transformer models has primarily focused on\nenhancing performance metrics such as accuracy and perplexity, practical\napplications in industry often necessitate a rigorous consideration of\ninference latency constraints. Addressing this challenge, we introduce\nSpeedLimit, a novel Neural Architecture Search (NAS) technique that optimizes\naccuracy whilst adhering to an upper-bound latency constraint. Our method\nincorporates 8-bit integer quantization in the search process to outperform the\ncurrent state-of-the-art technique. Our results underline the feasibility and\nefficacy of seeking an optimal balance between performance and latency,\nproviding new avenues for deploying state-of-the-art transformer models in\nlatency-sensitive environments.\n","authors":["Yuji Chai","Luke Bailey","Yunho Jin","Matthew Karle","Glenn G. Ko","David Brooks","Gu-Yeon Wei","H. T. 
Kung"],"pdf_url":"https://arxiv.org/pdf/2209.12127v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09259v1","updated":"2023-10-13T17:15:05Z","published":"2023-10-13T17:15:05Z","title":"Towards End-to-end 4-Bit Inference on Generative Large Language Models","summary":" We show that the majority of the inference computations for large generative\nmodels such as LLaMA and OPT can be performed with both weights and activations\nbeing cast to 4 bits, in a way that leads to practical speedups while at the\nsame time maintaining good accuracy. We achieve this via a hybrid quantization\nstrategy called QUIK, which compresses most of the weights and activations to\n4-bit, while keeping some outlier weights and activations in higher-precision.\nCrucially, our scheme is designed with computational efficiency in mind: we\nprovide GPU kernels with highly-efficient layer-wise runtimes, which lead to\npractical end-to-end throughput improvements of up to 3.1x relative to FP16\nexecution. Code and models are provided at https://github.com/IST-DASLab/QUIK.\n","authors":["Saleh Ashkboos","Ilia Markov","Elias Frantar","Tingxuan Zhong","Xincheng Wang","Jie Ren","Torsten Hoefler","Dan Alistarh"],"pdf_url":"https://arxiv.org/pdf/2310.09259v1.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2310.09254v1","updated":"2023-10-13T17:12:04Z","published":"2023-10-13T17:12:04Z","title":"Generative Entropic Neural Optimal Transport To Map Within and Across\n Spaces","summary":" Learning measure-to-measure mappings is a crucial task in machine learning,\nfeatured prominently in generative modeling. Recent years have witnessed a\nsurge of techniques that draw inspiration from optimal transport (OT) theory.\nCombined with neural network models, these methods collectively known as\n\\textit{Neural OT} use optimal transport as an inductive bias: such mappings\nshould be optimal w.r.t. a given cost function, in the sense that they are able\nto move points in a thrifty way, within (by minimizing displacements) or across\nspaces (by being isometric). This principle, while intuitive, is often\nconfronted with several practical challenges that require adapting the OT\ntoolbox: cost functions other than the squared-Euclidean cost can be\nchallenging to handle, the deterministic formulation of Monge maps leaves\nlittle flexibility, mapping across incomparable spaces raises multiple\nchallenges, while the mass conservation constraint inherent to OT can provide\ntoo much credit to outliers. While each of these mismatches between practice\nand theory has been addressed independently in various works, we propose in\nthis work an elegant framework to unify them, called \\textit{generative\nentropic neural optimal transport} (GENOT). GENOT can accommodate any cost\nfunction; handles randomness using conditional generative models; can map\npoints across incomparable spaces, and can be used as an \\textit{unbalanced}\nsolver. We evaluate our approach through experiments conducted on various\nsynthetic datasets and demonstrate its practicality in single-cell biology. 
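The QUIK entry above keeps most weights and activations in 4-bit while retaining a few outlier channels in higher precision. The NumPy sketch below illustrates that hybrid idea on a weight matrix: symmetric per-column INT4 quantization with the largest-magnitude columns kept in FP16. The outlier-selection rule and column granularity are simplifying assumptions, not the paper's exact scheme or its GPU kernels.

```python
import numpy as np

def hybrid_int4_quantize(w, num_outlier_cols=4):
    """Quantize most columns of w to symmetric INT4; keep outlier columns in FP16."""
    col_norms = np.abs(w).max(axis=0)
    outlier_idx = np.argsort(col_norms)[-num_outlier_cols:]      # largest-magnitude columns
    base_idx = np.setdiff1d(np.arange(w.shape[1]), outlier_idx)

    base = w[:, base_idx]
    scale = np.abs(base).max(axis=0, keepdims=True) / 7.0        # INT4 range [-8, 7]
    q = np.clip(np.round(base / scale), -8, 7).astype(np.int8)

    return {"q": q, "scale": scale, "base_idx": base_idx,
            "outliers": w[:, outlier_idx].astype(np.float16), "outlier_idx": outlier_idx}

def dequantize(packed, shape):
    w_hat = np.empty(shape, dtype=np.float32)
    w_hat[:, packed["base_idx"]] = packed["q"].astype(np.float32) * packed["scale"]
    w_hat[:, packed["outlier_idx"]] = packed["outliers"].astype(np.float32)
    return w_hat

w = np.random.randn(64, 32).astype(np.float32)
packed = hybrid_int4_quantize(w)
print(np.abs(w - dequantize(packed, w.shape)).max())   # worst-case reconstruction error
```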
In\nthis domain, GENOT proves to be valuable for tasks such as modeling cell\ndevelopment, predicting cellular responses to drugs, and translating between\ndifferent data modalities of cells.\n","authors":["Dominik Klein","Théo Uscidda","Fabian Theis","Marco Cuturi"],"pdf_url":"https://arxiv.org/pdf/2310.09254v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09250v1","updated":"2023-10-13T17:06:34Z","published":"2023-10-13T17:06:34Z","title":"It's an Alignment, Not a Trade-off: Revisiting Bias and Variance in Deep\n Models","summary":" Classical wisdom in machine learning holds that the generalization error can\nbe decomposed into bias and variance, and these two terms exhibit a\n\\emph{trade-off}. However, in this paper, we show that for an ensemble of deep\nlearning based classification models, bias and variance are \\emph{aligned} at a\nsample level, where squared bias is approximately \\emph{equal} to variance for\ncorrectly classified sample points. We present empirical evidence confirming\nthis phenomenon in a variety of deep learning models and datasets. Moreover, we\nstudy this phenomenon from two theoretical perspectives: calibration and neural\ncollapse. We first show theoretically that under the assumption that the models\nare well calibrated, we can observe the bias-variance alignment. Second,\nstarting from the picture provided by the neural collapse theory, we show an\napproximate correlation between bias and variance.\n","authors":["Lin Chen","Michal Lukasik","Wittawat Jitkrittum","Chong You","Sanjiv Kumar"],"pdf_url":"https://arxiv.org/pdf/2310.09250v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09247v1","updated":"2023-10-13T16:53:25Z","published":"2023-10-13T16:53:25Z","title":"Hypernymy Understanding Evaluation of Text-to-Image Models via WordNet\n Hierarchy","summary":" Text-to-image synthesis has recently attracted widespread attention due to\nrapidly improving quality and numerous practical applications. However, the\nlanguage understanding capabilities of text-to-image models are still poorly\nunderstood, which makes it difficult to reason about prompt formulations that a\ngiven model would understand well. In this work, we measure the capability of\npopular text-to-image models to understand $\\textit{hypernymy}$, or the \"is-a\"\nrelation between words. We design two automatic metrics based on the WordNet\nsemantic hierarchy and existing image classifiers pretrained on ImageNet. These\nmetrics both enable broad quantitative comparison of linguistic capabilities\nfor text-to-image models and offer a way of finding fine-grained qualitative\ndifferences, such as words that are unknown to models and thus are difficult\nfor them to draw. We comprehensively evaluate popular text-to-image models,\nincluding GLIDE, Latent Diffusion, and Stable Diffusion, showing how our\nmetrics can provide a better understanding of the individual strengths and\nweaknesses of these models.\n","authors":["Anton Baryshnikov","Max Ryabinin"],"pdf_url":"https://arxiv.org/pdf/2310.09247v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09236v1","updated":"2023-10-13T16:40:29Z","published":"2023-10-13T16:40:29Z","title":"Time CNN and Graph Convolution Network for Epileptic Spike Detection in\n MEG Data","summary":" Magnetoencephalography (MEG) recordings of patients with epilepsy exhibit\nspikes, a typical biomarker of the pathology. Detecting those spikes allows\naccurate localization of brain regions triggering seizures. Spike detection is\noften performed manually. 
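The bias-variance alignment entry above compares squared bias and variance at the level of individual samples across an ensemble. The NumPy sketch below shows how those per-sample quantities can be estimated from ensemble probability predictions; the random predictions stand in for real model outputs, and the paper's exact decomposition may differ in detail.

```python
import numpy as np

rng = np.random.default_rng(0)
n_models, n_samples, n_classes = 10, 5, 3

# Stand-ins: class probabilities from each ensemble member and one-hot labels.
logits = rng.normal(size=(n_models, n_samples, n_classes))
probs = np.exp(logits) / np.exp(logits).sum(axis=-1, keepdims=True)
labels = np.eye(n_classes)[rng.integers(0, n_classes, size=n_samples)]

mean_pred = probs.mean(axis=0)                                    # average over models
sq_bias = ((mean_pred - labels) ** 2).sum(axis=-1)                # per-sample squared bias
variance = ((probs - mean_pred) ** 2).sum(axis=-1).mean(axis=0)   # per-sample variance

for i in range(n_samples):
    print(f"sample {i}: bias^2={sq_bias[i]:.3f}  variance={variance[i]:.3f}")
```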
However, it is a burdensome and error prone task due\nto the complexity of MEG data. To address this problem, we propose a 1D\ntemporal convolutional neural network (Time CNN) coupled with a graph\nconvolutional network (GCN) to classify short time frames of MEG recording as\ncontaining a spike or not. Compared to other recent approaches, our models have\nfewer parameters to train and we propose to use a GCN to account for MEG\nsensors spatial relationships. Our models produce clinically relevant results\nand outperform deep learning-based state-of-the-art methods reaching a\nclassification f1-score of 76.7% on a balanced dataset and of 25.5% on a\nrealistic, highly imbalanced dataset, for the spike class.\n","authors":["Pauline Mouches","Thibaut Dejean","Julien Jung","Romain Bouet","Carole Lartizien","Romain Quentin"],"pdf_url":"https://arxiv.org/pdf/2310.09236v1.pdf","comment":"This work has been submitted to IEEE ISBI 2024 for possible\n publication"},{"id":"http://arxiv.org/abs/2310.09229v1","updated":"2023-10-13T16:31:51Z","published":"2023-10-13T16:31:51Z","title":"Insuring Smiles: Predicting routine dental coverage using Spark ML","summary":" Finding suitable health insurance coverage can be challenging for individuals\nand small enterprises in the USA. The Health Insurance Exchange Public Use\nFiles (Exchange PUFs) dataset provided by CMS offers valuable information on\nhealth and dental policies [1]. In this paper, we leverage machine learning\nalgorithms to predict if a health insurance plan covers routine dental services\nfor adults. By analyzing plan type, region, deductibles, out-of-pocket\nmaximums, and copayments, we employ Logistic Regression, Decision Tree, Random\nForest, Gradient Boost, Factorization Model and Support Vector Machine\nalgorithms. Our goal is to provide a clinical strategy for individuals and\nfamilies to select the most suitable insurance plan based on income and\nexpenses.\n","authors":["Aishwarya Gupta","Rahul S. Bhogale","Priyanka Thota","Prathushkumar Dathuri","Jongwook Woo"],"pdf_url":"https://arxiv.org/pdf/2310.09229v1.pdf","comment":"4 pages, 13 figures, 5 tables"},{"id":"http://arxiv.org/abs/2310.09222v1","updated":"2023-10-13T16:20:20Z","published":"2023-10-13T16:20:20Z","title":"Fast & Efficient Learning of Bayesian Networks from Data: Knowledge\n Discovery and Causality","summary":" Structure learning is essential for Bayesian networks (BNs) as it uncovers\ncausal relationships, and enables knowledge discovery, predictions, inferences,\nand decision-making under uncertainty. Two novel algorithms, FSBN and SSBN,\nbased on the PC algorithm, employ local search strategy and conditional\nindependence tests to learn the causal network structure from data. They\nincorporate d-separation to infer additional topology information, prioritize\nconditioning sets, and terminate the search immediately and efficiently. FSBN\nachieves up to 52% computation cost reduction, while SSBN surpasses it with a\nremarkable 72% reduction for a 200-node network. SSBN demonstrates further\nefficiency gains due to its intelligent strategy. Experimental studies show\nthat both algorithms match the induction quality of the PC algorithm while\nsignificantly reducing computation costs. 
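The MEG spike-detection entry above classifies short time frames with a 1D temporal CNN, coupled in the paper with a GCN over sensor locations. Below is a minimal sketch of the temporal branch only; the sensor count, kernel sizes, and window length are placeholder choices, and the graph-convolution part is omitted.

```python
import torch
import torch.nn as nn

class TimeCNN(nn.Module):
    """Tiny 1D CNN that labels a (sensors x time) window as spike / no spike."""

    def __init__(self, n_sensors=274, n_classes=2):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv1d(n_sensors, 64, kernel_size=7, padding=3), nn.ReLU(),
            nn.MaxPool1d(2),
            nn.Conv1d(64, 128, kernel_size=5, padding=2), nn.ReLU(),
            nn.AdaptiveAvgPool1d(1),
        )
        self.classifier = nn.Linear(128, n_classes)

    def forward(self, x):                  # x: (batch, sensors, time)
        return self.classifier(self.features(x).squeeze(-1))

model = TimeCNN()
window = torch.randn(4, 274, 200)          # four windows of 200 time points
print(model(window).shape)                 # torch.Size([4, 2])
```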
This enables them to offer\ninterpretability and adaptability while reducing the computational burden,\nmaking them valuable for various applications in big data analytics.\n","authors":["Minn Sein","Fu Shunkai"],"pdf_url":"https://arxiv.org/pdf/2310.09222v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.03026v2","updated":"2023-10-13T16:13:43Z","published":"2023-10-04T17:59:49Z","title":"LanguageMPC: Large Language Models as Decision Makers for Autonomous\n Driving","summary":" Existing learning-based autonomous driving (AD) systems face challenges in\ncomprehending high-level information, generalizing to rare events, and\nproviding interpretability. To address these problems, this work employs Large\nLanguage Models (LLMs) as a decision-making component for complex AD scenarios\nthat require human commonsense understanding. We devise cognitive pathways to\nenable comprehensive reasoning with LLMs, and develop algorithms for\ntranslating LLM decisions into actionable driving commands. Through this\napproach, LLM decisions are seamlessly integrated with low-level controllers by\nguided parameter matrix adaptation. Extensive experiments demonstrate that our\nproposed method not only consistently surpasses baseline approaches in\nsingle-vehicle tasks, but also helps handle complex driving behaviors even\nmulti-vehicle coordination, thanks to the commonsense reasoning capabilities of\nLLMs. This paper presents an initial step toward leveraging LLMs as effective\ndecision-makers for intricate AD scenarios in terms of safety, efficiency,\ngeneralizability, and interoperability. We aspire for it to serve as\ninspiration for future research in this field. Project page:\nhttps://sites.google.com/view/llm-mpc\n","authors":["Hao Sha","Yao Mu","Yuxuan Jiang","Li Chen","Chenfeng Xu","Ping Luo","Shengbo Eben Li","Masayoshi Tomizuka","Wei Zhan","Mingyu Ding"],"pdf_url":"https://arxiv.org/pdf/2310.03026v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.16506v2","updated":"2023-10-13T16:09:19Z","published":"2023-07-31T09:08:40Z","title":"Explainable Equivariant Neural Networks for Particle Physics: PELICAN","summary":" PELICAN is a novel permutation equivariant and Lorentz invariant or covariant\naggregator network designed to overcome common limitations found in\narchitectures applied to particle physics problems. Compared to many approaches\nthat use non-specialized architectures that neglect underlying physics\nprinciples and require very large numbers of parameters, PELICAN employs a\nfundamentally symmetry group-based architecture that demonstrates benefits in\nterms of reduced complexity, increased interpretability, and raw performance.\nWe present a comprehensive study of the PELICAN algorithm architecture in the\ncontext of both tagging (classification) and reconstructing (regression)\nLorentz-boosted top quarks, including the difficult task of specifically\nidentifying and measuring the $W$-boson inside the dense environment of the\nLorentz-boosted top-quark hadronic final state. We also extend the application\nof PELICAN to the tasks of identifying quark-initiated vs.~gluon-initiated\njets, and a multi-class identification across five separate target categories\nof jets. When tested on the standard task of Lorentz-boosted top-quark tagging,\nPELICAN outperforms existing competitors with much lower model complexity and\nhigh sample efficiency. On the less common and more complex task of 4-momentum\nregression, PELICAN also outperforms hand-crafted, non-machine learning\nalgorithms. 
We discuss the implications of symmetry-restricted architectures\nfor the wider field of machine learning for physics.\n","authors":["Alexander Bogatskiy","Timothy Hoffman","David W. Miller","Jan T. Offermann","Xiaoyang Liu"],"pdf_url":"https://arxiv.org/pdf/2307.16506v2.pdf","comment":"50 pages, 34 figures, 12 tables"},{"id":"http://arxiv.org/abs/2310.09213v1","updated":"2023-10-13T16:07:31Z","published":"2023-10-13T16:07:31Z","title":"Unseen Image Synthesis with Diffusion Models","summary":" While the current trend in the generative field is scaling up towards larger\nmodels and more training data for generalized domain representations, we go the\nopposite direction in this work by synthesizing unseen domain images without\nadditional training. We do so via latent sampling and geometric optimization\nusing pre-trained and frozen Denoising Diffusion Probabilistic Models (DDPMs)\non single-domain datasets. Our key observation is that DDPMs pre-trained even\njust on single-domain images are already equipped with sufficient\nrepresentation abilities to reconstruct arbitrary images from the inverted\nlatent encoding following bi-directional deterministic diffusion and denoising\ntrajectories. This motivates us to investigate the statistical and geometric\nbehaviors of the Out-Of-Distribution (OOD) samples from unseen image domains in\nthe latent spaces along the denoising chain. Notably, we theoretically and\nempirically show that the inverted OOD samples also establish Gaussians that\nare distinguishable from the original In-Domain (ID) samples in the\nintermediate latent spaces, which allows us to sample from them directly.\nGeometrical domain-specific and model-dependent information of the unseen\nsubspace (e.g., sample-wise distance and angles) is used to further optimize\nthe sampled OOD latent encodings from the estimated Gaussian prior. We conduct\nextensive analysis and experiments using pre-trained diffusion models (DDPM,\niDDPM) on different datasets (AFHQ, CelebA-HQ, LSUN-Church, and LSUN-Bedroom),\nproving the effectiveness of this novel perspective to explore and re-think the\ndiffusion models' data synthesis generalization ability.\n","authors":["Ye Zhu","Yu Wu","Zhiwei Deng","Olga Russakovsky","Yan Yan"],"pdf_url":"https://arxiv.org/pdf/2310.09213v1.pdf","comment":"28 pages including appendices"},{"id":"http://arxiv.org/abs/2310.03684v2","updated":"2023-10-13T16:04:55Z","published":"2023-10-05T17:01:53Z","title":"SmoothLLM: Defending Large Language Models Against Jailbreaking Attacks","summary":" Despite efforts to align large language models (LLMs) with human values,\nwidely-used LLMs such as GPT, Llama, Claude, and PaLM are susceptible to\njailbreaking attacks, wherein an adversary fools a targeted LLM into generating\nobjectionable content. To address this vulnerability, we propose SmoothLLM, the\nfirst algorithm designed to mitigate jailbreaking attacks on LLMs. Based on our\nfinding that adversarially-generated prompts are brittle to character-level\nchanges, our defense first randomly perturbs multiple copies of a given input\nprompt, and then aggregates the corresponding predictions to detect adversarial\ninputs. SmoothLLM reduces the attack success rate on numerous popular LLMs to\nbelow one percentage point, avoids unnecessary conservatism, and admits\nprovable guarantees on attack mitigation. 
Moreover, our defense uses\nexponentially fewer queries than existing attacks and is compatible with any\nLLM.\n","authors":["Alexander Robey","Eric Wong","Hamed Hassani","George J. Pappas"],"pdf_url":"https://arxiv.org/pdf/2310.03684v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09210v1","updated":"2023-10-13T16:04:06Z","published":"2023-10-13T16:04:06Z","title":"Regularization-Based Methods for Ordinal Quantification","summary":" Quantification, i.e., the task of training predictors of the class prevalence\nvalues in sets of unlabeled data items, has received increased attention in\nrecent years. However, most quantification research has concentrated on\ndeveloping algorithms for binary and multiclass problems in which the classes\nare not ordered. Here, we study the ordinal case, i.e., the case in which a\ntotal order is defined on the set of n>2 classes. We give three main\ncontributions to this field. First, we create and make available two datasets\nfor ordinal quantification (OQ) research that overcome the inadequacies of the\npreviously available ones. Second, we experimentally compare the most important\nOQ algorithms proposed in the literature so far. To this end, we bring together\nalgorithms proposed by authors from very different research fields, such as\ndata mining and astrophysics, who were unaware of each others' developments.\nThird, we propose a novel class of regularized OQ algorithms, which outperforms\nexisting algorithms in our experiments. The key to this gain in performance is\nthat our regularization prevents ordinally implausible estimates, assuming that\nordinal distributions tend to be smooth in practice. We informally verify this\nassumption for several real-world applications.\n","authors":["Mirko Bunse","Alejandro Moreo","Fabrizio Sebastiani","Martin Senz"],"pdf_url":"https://arxiv.org/pdf/2310.09210v1.pdf","comment":"45 pages"},{"id":"http://arxiv.org/abs/2305.08960v2","updated":"2023-10-13T15:52:36Z","published":"2023-05-15T19:02:46Z","title":"One Forward is Enough for Neural Network Training via Likelihood Ratio\n Method","summary":" While backpropagation (BP) is the mainstream approach for gradient\ncomputation in neural network training, its heavy reliance on the chain rule of\ndifferentiation constrains the designing flexibility of network architecture\nand training pipelines. We avoid the recursive computation in BP and develop a\nunified likelihood ratio (ULR) method for gradient estimation with just one\nforward propagation. Not only can ULR be extended to train a wide variety of\nneural network architectures, but the computation flow in BP can also be\nrearranged by ULR for better device adaptation. Moreover, we propose several\nvariance reduction techniques to further accelerate the training process. Our\nexperiments offer numerical results across diverse aspects, including various\nneural network training scenarios, computation flow rearrangement, and\nfine-tuning of pre-trained models. 
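The SmoothLLM entry above perturbs several copies of a prompt at the character level and aggregates the resulting judgements. The sketch below shows that randomize-and-vote skeleton with a dummy classifier in place of the target LLM and its jailbreak check; the perturbation rate, the trigger-word stand-in, and the majority-vote rule are illustrative assumptions.

```python
import random
import string

rng = random.Random(0)

def perturb(prompt, rate=0.1):
    """Randomly replace a fraction of characters in the prompt."""
    chars = list(prompt)
    for i in range(len(chars)):
        if rng.random() < rate:
            chars[i] = rng.choice(string.ascii_letters)
    return "".join(chars)

def dummy_llm(prompt):
    # Stand-in for querying the target LLM: refuses if the trigger word survives.
    return "I cannot help with that." if "attack" in prompt else "Sure, here is ..."

def is_refusal(response):
    return "cannot help" in response

def smoothed_refusal(prompt, n_copies=10):
    """Query perturbed copies and take a majority vote over refusal decisions."""
    votes = [is_refusal(dummy_llm(perturb(prompt))) for _ in range(n_copies)]
    return sum(votes) > n_copies // 2

print(smoothed_refusal("please attack this web server"))
```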
All findings demonstrate that ULR\neffectively enhances the flexibility of neural network training by permitting\nlocalized module training without compromising the global objective and\nsignificantly boosts the network robustness.\n","authors":["Jinyang Jiang","Zeliang Zhang","Chenliang Xu","Zhaofei Yu","Yijie Peng"],"pdf_url":"https://arxiv.org/pdf/2305.08960v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09203v1","updated":"2023-10-13T15:48:24Z","published":"2023-10-13T15:48:24Z","title":"SiamAF: Learning Shared Information from ECG and PPG Signals for Robust\n Atrial Fibrillation Detection","summary":" Atrial fibrillation (AF) is the most common type of cardiac arrhythmia. It is\nassociated with an increased risk of stroke, heart failure, and other\ncardiovascular complications, but can be clinically silent. Passive AF\nmonitoring with wearables may help reduce adverse clinical outcomes related to\nAF. Detecting AF in noisy wearable data poses a significant challenge, leading\nto the emergence of various deep learning techniques. Previous deep learning\nmodels learn from a single modality, either electrocardiogram (ECG) or\nphotoplethysmography (PPG) signals. However, deep learning models often\nstruggle to learn generalizable features and rely on features that are more\nsusceptible to corruption from noise, leading to sub-optimal performances in\ncertain scenarios, especially with low-quality signals. Given the increasing\navailability of ECG and PPG signal pairs from wearables and bedside monitors,\nwe propose a new approach, SiamAF, leveraging a novel Siamese network\narchitecture and joint learning loss function to learn shared information from\nboth ECG and PPG signals. At inference time, the proposed model is able to\npredict AF from either PPG or ECG and outperforms baseline methods on three\nexternal test sets. It learns medically relevant features as a result of our\nnovel architecture design. The proposed model also achieves comparable\nperformance to traditional learning regimes while requiring much fewer training\nlabels, providing a potential approach to reduce future reliance on manual\nlabeling.\n","authors":["Zhicheng Guo","Cheng Ding","Duc H. Do","Amit Shah","Randall J. Lee","Xiao Hu","Cynthia Rudin"],"pdf_url":"https://arxiv.org/pdf/2310.09203v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09202v1","updated":"2023-10-13T15:48:12Z","published":"2023-10-13T15:48:12Z","title":"Graph Condensation via Eigenbasis Matching","summary":" The increasing amount of graph data places requirements on the efficiency and\nscalability of graph neural networks (GNNs), despite their effectiveness in\nvarious graph-related applications. Recently, the emerging graph condensation\n(GC) sheds light on reducing the computational cost of GNNs from a data\nperspective. It aims to replace the real large graph with a significantly\nsmaller synthetic graph so that GNNs trained on both graphs exhibit comparable\nperformance. However, our empirical investigation reveals that existing GC\nmethods suffer from poor generalization, i.e., different GNNs trained on the\nsame synthetic graph have obvious performance gaps. What factors hinder the\ngeneralization of GC and how can we mitigate it? To answer this question, we\ncommence with a detailed analysis and observe that GNNs will inject spectrum\nbias into the synthetic graph, resulting in a distribution shift. 
To tackle\nthis issue, we propose eigenbasis matching for spectrum-free graph\ncondensation, named GCEM, which has two key steps: First, GCEM matches the\neigenbasis of the real and synthetic graphs, rather than the graph structure,\nwhich eliminates the spectrum bias of GNNs. Subsequently, GCEM leverages the\nspectrum of the real graph and the synthetic eigenbasis to construct the\nsynthetic graph, thereby preserving the essential structural information. We\ntheoretically demonstrate that the synthetic graph generated by GCEM maintains\nthe spectral similarity, i.e., total variation, of the real graph. Extensive\nexperiments conducted on five graph datasets verify that GCEM not only achieves\nstate-of-the-art performance over baselines but also significantly narrows the\nperformance gaps between different GNNs.\n","authors":["Yang Liu","Deyu Bo","Chuan Shi"],"pdf_url":"https://arxiv.org/pdf/2310.09202v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2310.09196v1","updated":"2023-10-13T15:42:55Z","published":"2023-10-13T15:42:55Z","title":"A 4-approximation algorithm for min max correlation clustering","summary":" We introduce a lower bounding technique for the min max correlation\nclustering problem and, based on this technique, a combinatorial\n4-approximation algorithm for complete graphs. This improves upon the previous\nbest known approximation guarantees of 5, using a linear program formulation\n(Kalhan et al., 2019), and 4, for a combinatorial algorithm (Davies et al.,\n2023). We extend this algorithm by a greedy joining heuristic and show\nempirically that it improves the state of the art in solution quality and\nruntime on several benchmark datasets.\n","authors":["Holger Heidrich","Jannik Irmai","Bjoern Andres"],"pdf_url":"https://arxiv.org/pdf/2310.09196v1.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2310.09194v1","updated":"2023-10-13T15:40:55Z","published":"2023-10-13T15:40:55Z","title":"Variational autoencoder with weighted samples for high-dimensional\n non-parametric adaptive importance sampling","summary":" Probability density function estimation with weighted samples is the main\nfoundation of all adaptive importance sampling algorithms. Classically, a\ntarget distribution is approximated either by a non-parametric model or within\na parametric family. However, these models suffer from the curse of\ndimensionality or from their lack of flexibility. In this contribution, we\nsuggest to use as the approximating model a distribution parameterised by a\nvariational autoencoder. We extend the existing framework to the case of\nweighted samples by introducing a new objective function. The flexibility of\nthe obtained family of distributions makes it as expressive as a non-parametric\nmodel, and despite the very high number of parameters to estimate, this family\nis much more efficient in high dimension than the classical Gaussian or\nGaussian mixture families. Moreover, in order to add flexibility to the model\nand to be able to learn multimodal distributions, we consider a learnable prior\ndistribution for the variational autoencoder latent variables. We also\nintroduce a new pre-training procedure for the variational autoencoder to find\ngood starting weights of the neural networks to prevent as much as possible the\nposterior collapse phenomenon to happen. 
Finally, we make explicit how the resulting\ndistribution can be combined with importance sampling, and we exploit the\nproposed procedure in existing adaptive importance sampling algorithms to draw\npoints from a target distribution and to estimate a rare event probability in\nhigh dimension on two multimodal problems.\n","authors":["Julien Demange-Chryst","François Bachoc","Jérôme Morio","Timothé Krauth"],"pdf_url":"https://arxiv.org/pdf/2310.09194v1.pdf","comment":"20 pages, 5 figures"},{"id":"http://arxiv.org/abs/2310.09192v1","updated":"2023-10-13T15:36:48Z","published":"2023-10-13T15:36:48Z","title":"Does Graph Distillation See Like Vision Dataset Counterpart?","summary":" Training on large-scale graphs has achieved remarkable results in graph\nrepresentation learning, but its cost and storage have attracted increasing\nconcerns. Existing graph condensation methods primarily focus on optimizing the\nfeature matrices of condensed graphs while overlooking the impact of the\nstructure information from the original graphs. To investigate the impact of\nthe structure information, we conduct analysis from the spectral domain and\nempirically identify substantial Laplacian Energy Distribution (LED) shifts in\nprevious works. Such shifts lead to poor performance in cross-architecture\ngeneralization and specific tasks, including anomaly detection and link\nprediction. In this paper, we propose a novel Structure-broadcasting Graph\nDataset Distillation (SGDD) scheme for broadcasting the original structure\ninformation to the generation of the synthetic one, which explicitly prevents\noverlooking the original structure information. Theoretically, the synthetic\ngraphs by SGDD are expected to have smaller LED shifts than previous works,\nleading to superior performance in both cross-architecture settings and\nspecific tasks. We validate the proposed SGDD across 9 datasets and achieve\nstate-of-the-art results on all of them: for example, on the YelpChi dataset,\nour approach maintains 98.6% of the test accuracy of training on the original\ngraph dataset with 1,000 times saving on the scale of the graph. Moreover, we\nempirically observe 17.6% ~ 31.4% reductions in LED shift across the\n9 datasets. Extensive experiments and analysis verify the effectiveness and\nnecessity of the proposed designs. The code is available in the GitHub\nrepository: https://github.com/RingBDStack/SGDD.\n","authors":["Beining Yang","Kai Wang","Qingyun Sun","Cheng Ji","Xingcheng Fu","Hao Tang","Yang You","Jianxin Li"],"pdf_url":"https://arxiv.org/pdf/2310.09192v1.pdf","comment":"Accepted by NeurIPS 2023"},{"id":"http://arxiv.org/abs/2305.17216v3","updated":"2023-10-13T15:35:42Z","published":"2023-05-26T19:22:03Z","title":"Generating Images with Multimodal Language Models","summary":" We propose a method to fuse frozen text-only large language models (LLMs)\nwith pre-trained image encoder and decoder models, by mapping between their\nembedding spaces. Our model demonstrates a wide suite of multimodal\ncapabilities: image retrieval, novel image generation, and multimodal dialogue.\nOurs is the first approach capable of conditioning on arbitrarily interleaved\nimage and text inputs to generate coherent image (and text) outputs. To achieve\nstrong performance on image generation, we propose an efficient mapping network\nto ground the LLM to an off-the-shelf text-to-image generation model. 
This\nmapping network translates hidden representations of text into the embedding\nspace of the visual models, enabling us to leverage the strong text\nrepresentations of the LLM for visual outputs. Our approach outperforms\nbaseline generation models on tasks with longer and more complex language. In\naddition to novel image generation, our model is also capable of image\nretrieval from a prespecified dataset, and decides whether to retrieve or\ngenerate at inference time. This is done with a learnt decision module which\nconditions on the hidden representations of the LLM. Our model exhibits a wider\nrange of capabilities compared to prior multimodal language models. It can\nprocess image-and-text inputs, and produce retrieved images, generated images,\nand generated text -- outperforming non-LLM based generation models across\nseveral text-to-image tasks that measure context dependence.\n","authors":["Jing Yu Koh","Daniel Fried","Ruslan Salakhutdinov"],"pdf_url":"https://arxiv.org/pdf/2305.17216v3.pdf","comment":"NeurIPS 2023. Project page: http://jykoh.com/gill"},{"id":"http://arxiv.org/abs/2306.03655v2","updated":"2023-10-13T15:33:48Z","published":"2023-06-06T13:15:01Z","title":"Online Learning under Adversarial Nonlinear Constraints","summary":" In many applications, learning systems are required to process continuous\nnon-stationary data streams. We study this problem in an online learning\nframework and propose an algorithm that can deal with adversarial time-varying\nand nonlinear constraints. As we show in our work, the algorithm called\nConstraint Violation Velocity Projection (CVV-Pro) achieves $\\sqrt{T}$ regret\nand converges to the feasible set at a rate of $1/\\sqrt{T}$, despite the fact\nthat the feasible set is slowly time-varying and a priori unknown to the\nlearner. CVV-Pro only relies on local sparse linear approximations of the\nfeasible set and therefore avoids optimizing over the entire set at each\niteration, which is in sharp contrast to projected gradients or Frank-Wolfe\nmethods. We also empirically evaluate our algorithm on two-player games, where\nthe players are subjected to a shared constraint.\n","authors":["Pavel Kolev","Georg Martius","Michael Muehlebach"],"pdf_url":"https://arxiv.org/pdf/2306.03655v2.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2303.00233v3","updated":"2023-10-13T15:32:57Z","published":"2023-03-01T05:03:23Z","title":"Single-Cell Multimodal Prediction via Transformers","summary":" The recent development of multimodal single-cell technology has made the\npossibility of acquiring multiple omics data from individual cells, thereby\nenabling a deeper understanding of cellular states and dynamics. Nevertheless,\nthe proliferation of multimodal single-cell data also introduces tremendous\nchallenges in modeling the complex interactions among different modalities. The\nrecently advanced methods focus on constructing static interaction graphs and\napplying graph neural networks (GNNs) to learn from multimodal data. However,\nsuch static graphs can be suboptimal as they do not take advantage of the\ndownstream task information; meanwhile GNNs also have some inherent limitations\nwhen deeply stacking GNN layers. To tackle these issues, in this work, we\ninvestigate how to leverage transformers for multimodal single-cell data in an\nend-to-end manner while exploiting downstream task information. 
In particular,\nwe propose a scMoFormer framework which can readily incorporate external domain\nknowledge and model the interactions within each modality and cross modalities.\nExtensive experiments demonstrate that scMoFormer achieves superior performance\non various benchmark datasets. Remarkably, scMoFormer won a Kaggle silver medal\nwith the rank of 24/1221 (Top 2%) without ensemble in a NeurIPS 2022\ncompetition. Our implementation is publicly available at Github.\n","authors":["Wenzhuo Tang","Hongzhi Wen","Renming Liu","Jiayuan Ding","Wei Jin","Yuying Xie","Hui Liu","Jiliang Tang"],"pdf_url":"https://arxiv.org/pdf/2303.00233v3.pdf","comment":"CIKM 2023"},{"id":"http://arxiv.org/abs/2207.10062v4","updated":"2023-10-13T15:24:24Z","published":"2022-07-20T17:47:54Z","title":"DataPerf: Benchmarks for Data-Centric AI Development","summary":" Machine learning research has long focused on models rather than datasets,\nand prominent datasets are used for common ML tasks without regard to the\nbreadth, difficulty, and faithfulness of the underlying problems. Neglecting\nthe fundamental importance of data has given rise to inaccuracy, bias, and\nfragility in real-world applications, and research is hindered by saturation\nacross existing dataset benchmarks. In response, we present DataPerf, a\ncommunity-led benchmark suite for evaluating ML datasets and data-centric\nalgorithms. We aim to foster innovation in data-centric AI through competition,\ncomparability, and reproducibility. We enable the ML community to iterate on\ndatasets, instead of just architectures, and we provide an open, online\nplatform with multiple rounds of challenges to support this iterative\ndevelopment. The first iteration of DataPerf contains five benchmarks covering\na wide spectrum of data-centric techniques, tasks, and modalities in vision,\nspeech, acquisition, debugging, and diffusion prompting, and we support hosting\nnew contributed benchmarks from the community. The benchmarks, online\nevaluation platform, and baseline implementations are open source, and the\nMLCommons Association will maintain DataPerf to ensure long-term benefits to\nacademia and industry.\n","authors":["Mark Mazumder","Colby Banbury","Xiaozhe Yao","Bojan Karlaš","William Gaviria Rojas","Sudnya Diamos","Greg Diamos","Lynn He","Alicia Parrish","Hannah Rose Kirk","Jessica Quaye","Charvi Rastogi","Douwe Kiela","David Jurado","David Kanter","Rafael Mosquera","Juan Ciro","Lora Aroyo","Bilge Acun","Lingjiao Chen","Mehul Smriti Raje","Max Bartolo","Sabri Eyuboglu","Amirata Ghorbani","Emmett Goodman","Oana Inel","Tariq Kane","Christine R. Kirkpatrick","Tzu-Sheng Kuo","Jonas Mueller","Tristan Thrush","Joaquin Vanschoren","Margaret Warren","Adina Williams","Serena Yeung","Newsha Ardalani","Praveen Paritosh","Lilith Bat-Leah","Ce Zhang","James Zou","Carole-Jean Wu","Cody Coleman","Andrew Ng","Peter Mattson","Vijay Janapa Reddi"],"pdf_url":"https://arxiv.org/pdf/2207.10062v4.pdf","comment":"NeurIPS 2023 Datasets and Benchmarks Track"},{"id":"http://arxiv.org/abs/2310.09183v1","updated":"2023-10-13T15:21:25Z","published":"2023-10-13T15:21:25Z","title":"PRIOR: Personalized Prior for Reactivating the Information Overlooked in\n Federated Learning","summary":" Classical federated learning (FL) enables training machine learning models\nwithout sharing data for privacy preservation, but heterogeneous data\ncharacteristic degrades the performance of the localized model. 
Personalized FL\n(PFL) addresses this by synthesizing personalized models from a global model\nvia training on local data. Such a global model may overlook the specific\ninformation of the clients that have been sampled. In this paper, we propose a\nnovel scheme to inject personalized prior knowledge into the global model in\neach client, which attempts to mitigate the introduced incomplete information\nproblem in PFL. At the heart of our proposed approach is a framework, the PFL\nwith Bregman Divergence (pFedBreD), decoupling the personalized prior from the\nlocal objective function regularized by Bregman divergence for greater\nadaptability in personalized scenarios. We also relax the mirror descent (RMD)\nto extract the prior explicitly to provide optional strategies. Additionally,\nour pFedBreD is backed up by a convergence analysis. Extensive experiments\ndemonstrate that our method reaches state-of-the-art performance on 5\ndatasets and outperforms other methods by up to 3.5% across 8 benchmarks.\nExtensive analyses verify the robustness and necessity of the proposed designs.\n","authors":["Mingjia Shi","Yuhao Zhou","Kai Wang","Huaizheng Zhang","Shudong Huang","Qing Ye","Jiangcheng Lv"],"pdf_url":"https://arxiv.org/pdf/2310.09183v1.pdf","comment":"This paper is accepted by NeurIPS 2023"},{"id":"http://arxiv.org/abs/2308.02490v2","updated":"2023-10-13T15:16:59Z","published":"2023-08-04T17:59:47Z","title":"MM-Vet: Evaluating Large Multimodal Models for Integrated Capabilities","summary":" We propose MM-Vet, an evaluation benchmark that examines large multimodal\nmodels (LMMs) on complicated multimodal tasks. Recent LMMs have shown various\nintriguing abilities, such as solving math problems written on the blackboard,\nreasoning about events and celebrities in news images, and explaining visual\njokes. Rapid model advancements pose challenges to evaluation benchmark\ndevelopment. Problems include: (1) How to systematically structure and evaluate\nthe complicated multimodal tasks; (2) How to design evaluation metrics that\nwork well across question and answer types; and (3) How to give model insights\nbeyond a simple performance ranking. To this end, we present MM-Vet, designed\nbased on the insight that the intriguing ability to solve complicated tasks is\noften achieved by a generalist model being able to integrate different core\nvision-language (VL) capabilities. MM-Vet defines 6 core VL capabilities and\nexamines the 16 integrations of interest derived from the capability\ncombination. For evaluation metrics, we propose an LLM-based evaluator for\nopen-ended outputs. The evaluator enables the evaluation across different\nquestion types and answer styles, resulting in a unified scoring metric. We\nevaluate representative LMMs on MM-Vet, providing insights into the\ncapabilities of different LMM system paradigms and models. Code and data are\navailable at https://github.com/yuweihao/MM-Vet.\n","authors":["Weihao Yu","Zhengyuan Yang","Linjie Li","Jianfeng Wang","Kevin Lin","Zicheng Liu","Xinchao Wang","Lijuan Wang"],"pdf_url":"https://arxiv.org/pdf/2308.02490v2.pdf","comment":"Update results of OpenFlamingo-9B (MPT), LLaMA-Adapter v2-7B, and\n Otter-9B (MPT). 
Code, data and leaderboard:\n https://github.com/yuweihao/MM-Vet"},{"id":"http://arxiv.org/abs/2310.09167v1","updated":"2023-10-13T15:01:55Z","published":"2023-10-13T15:01:55Z","title":"A Deep Neural Network -- Mechanistic Hybrid Model to Predict\n Pharmacokinetics in Rat","summary":" An important aspect in the development of small molecules as drugs or\nagro-chemicals is their systemic availability after intravenous and oral\nadministration. The prediction of the systemic availability from the chemical\nstructure of a potential candidate is highly desirable, as it allows focusing\nthe drug or agrochemical development on compounds with a favorable kinetic\nprofile. However, such predictions are challenging as the availability is the\nresult of the complex interplay between molecular properties, biology and\nphysiology, and training data is rare. In this work we improve the hybrid model\ndeveloped earlier [34]. We reduce the median fold change error for the total\noral exposure from 2.85 to 2.35 and for intravenous administration from 1.95 to\n1.62. This is achieved by training on a larger data set, improving the neural\nnetwork architecture as well as the parametrization of the mechanistic model.\nFurther, we extend our approach to predict additional endpoints and to handle\ndifferent covariates, like sex and dosage form. In contrast to a pure machine\nlearning model, our model is able to predict new endpoints on which it has not\nbeen trained. We demonstrate this feature by predicting the exposure over the\nfirst 24h, while the model has only been trained on the total exposure.\n","authors":["Florian Führer","Andrea Gruber","Holger Diedam","Andreas H. Göller","Stephan Menz","Sebastian Schneckener"],"pdf_url":"https://arxiv.org/pdf/2310.09167v1.pdf","comment":"Journal of Computer-Aided Molecular Design"},{"id":"http://arxiv.org/abs/2310.09162v1","updated":"2023-10-13T14:56:38Z","published":"2023-10-13T14:56:38Z","title":"Quantum Machine Learning in Climate Change and Sustainability: a Review","summary":" Climate change and its impact on global sustainability are critical\nchallenges, demanding innovative solutions that combine cutting-edge\ntechnologies and scientific insights. Quantum machine learning (QML) has\nemerged as a promising paradigm that harnesses the power of quantum computing\nto address complex problems in various domains including climate change and\nsustainability. In this work, we survey existing literature that applies\nquantum machine learning to solve climate change and sustainability-related\nproblems. We review promising QML methodologies that have the potential to\naccelerate decarbonization including energy systems, climate data forecasting,\nclimate monitoring, and hazardous events predictions. We discuss the challenges\nand current limitations of quantum machine learning approaches and provide an\noverview of potential opportunities and future work to leverage QML-based\nmethods in the important area of climate change research.\n","authors":["Amal Nammouchi","Andreas Kassler","Andreas Theorachis"],"pdf_url":"https://arxiv.org/pdf/2310.09162v1.pdf","comment":"8 pages Accepted for publication in AAAI proceedings (AAAI Fall\n symposium 2023)"},{"id":"http://arxiv.org/abs/2310.09163v1","updated":"2023-10-13T14:56:38Z","published":"2023-10-13T14:56:38Z","title":"Jointly-Learned Exit and Inference for a Dynamic Neural Network :\n JEI-DNN","summary":" Large pretrained models, coupled with fine-tuning, are slowly becoming\nestablished as the dominant architecture in machine learning. 
Even though these\nmodels offer impressive performance, their practical application is often\nlimited by the prohibitive amount of resources required for every inference.\nEarly-exiting dynamic neural networks (EDNN) circumvent this issue by allowing\na model to make some of its predictions from intermediate layers (i.e.,\nearly-exit). Training an EDNN architecture is challenging as it consists of two\nintertwined components: the gating mechanism (GM) that controls early-exiting\ndecisions and the intermediate inference modules (IMs) that perform inference\nfrom intermediate representations. As a result, most existing approaches rely\non thresholding confidence metrics for the gating mechanism and strive to\nimprove the underlying backbone network and the inference modules. Although\nsuccessful, this approach has two fundamental shortcomings: 1) the GMs and the\nIMs are decoupled during training, leading to a train-test mismatch; and 2) the\nthresholding gating mechanism introduces a positive bias into the predictive\nprobabilities, making it difficult to readily extract uncertainty information.\nWe propose a novel architecture that connects these two modules. This leads to\nsignificant performance improvements on classification datasets and enables\nbetter uncertainty characterization capabilities.\n","authors":["Florence Regol","Joud Chataoui","Mark Coates"],"pdf_url":"https://arxiv.org/pdf/2310.09163v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09157v1","updated":"2023-10-13T14:52:46Z","published":"2023-10-13T14:52:46Z","title":"The Computational Complexity of Finding Stationary Points in Non-Convex\n Optimization","summary":" Finding approximate stationary points, i.e., points where the gradient is\napproximately zero, of non-convex but smooth objective functions $f$ over\nunrestricted $d$-dimensional domains is one of the most fundamental problems in\nclassical non-convex optimization. Nevertheless, the computational and query\ncomplexity of this problem are still not well understood when the dimension $d$\nof the problem is independent of the approximation error. In this paper, we\nshow the following computational and query complexity results:\n 1. The problem of finding approximate stationary points over unrestricted\ndomains is PLS-complete.\n 2. For $d = 2$, we provide a zero-order algorithm for finding\n$\\varepsilon$-approximate stationary points that requires at most\n$O(1/\\varepsilon)$ value queries to the objective function.\n 3. We show that any algorithm needs at least $\\Omega(1/\\varepsilon)$ queries\nto the objective function and/or its gradient to find $\\varepsilon$-approximate\nstationary points when $d=2$. Combined with the above, this characterizes the\nquery complexity of this problem to be $\\Theta(1/\\varepsilon)$.\n 4. For $d = 2$, we provide a zero-order algorithm for finding\n$\\varepsilon$-KKT points in constrained optimization problems that requires at\nmost $O(1/\\sqrt{\\varepsilon})$ value queries to the objective function. This\ncloses the gap between the works of Bubeck and Mikulincer [2020] and Vavasis\n[1993] and characterizes the query complexity of this problem to be\n$\\Theta(1/\\sqrt{\\varepsilon})$.\n 5. Combining our results with the recent result of Fearnley et al. 
[2022], we\nshow that finding approximate KKT points in constrained optimization is\nreducible to finding approximate stationary points in unconstrained\noptimization but the converse is impossible.\n","authors":["Alexandros Hollender","Manolis Zampetakis"],"pdf_url":"https://arxiv.org/pdf/2310.09157v1.pdf","comment":"Full version of COLT 2023 extended abstract"},{"id":"http://arxiv.org/abs/2310.09149v1","updated":"2023-10-13T14:43:11Z","published":"2023-10-13T14:43:11Z","title":"Lattice Approximations in Wasserstein Space","summary":" We consider structured approximation of measures in Wasserstein space\n$W_p(\\mathbb{R}^d)$ for $p\\in[1,\\infty)$ by discrete and piecewise constant\nmeasures based on a scaled Voronoi partition of $\\mathbb{R}^d$. We show that if\na full rank lattice $\\Lambda$ is scaled by a factor of $h\\in(0,1]$, then\napproximation of a measure based on the Voronoi partition of $h\\Lambda$ is\n$O(h)$ regardless of $d$ or $p$. We then use a covering argument to show that\n$N$-term approximations of compactly supported measures is $O(N^{-\\frac1d})$\nwhich matches known rates for optimal quantizers and empirical measure\napproximation in most instances. Finally, we extend these results to\nnoncompactly supported measures with sufficient decay.\n","authors":["Keaton Hamm","Varun Khurana"],"pdf_url":"https://arxiv.org/pdf/2310.09149v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09144v1","updated":"2023-10-13T14:35:59Z","published":"2023-10-13T14:35:59Z","title":"Goodhart's Law in Reinforcement Learning","summary":" Implementing a reward function that perfectly captures a complex task in the\nreal world is impractical. As a result, it is often appropriate to think of the\nreward function as a proxy for the true objective rather than as its\ndefinition. We study this phenomenon through the lens of Goodhart's law, which\npredicts that increasing optimisation of an imperfect proxy beyond some\ncritical point decreases performance on the true objective. First, we propose a\nway to quantify the magnitude of this effect and show empirically that\noptimising an imperfect proxy reward often leads to the behaviour predicted by\nGoodhart's law for a wide range of environments and reward functions. We then\nprovide a geometric explanation for why Goodhart's law occurs in Markov\ndecision processes. We use these theoretical insights to propose an optimal\nearly stopping method that provably avoids the aforementioned pitfall and\nderive theoretical regret bounds for this method. Moreover, we derive a\ntraining method that maximises worst-case reward, for the setting where there\nis uncertainty about the true reward function. Finally, we evaluate our early\nstopping method experimentally. Our results support a foundation for a\ntheoretically-principled study of reinforcement learning under reward\nmisspecification.\n","authors":["Jacek Karwowski","Oliver Hayman","Xingjian Bai","Klaus Kiendlhofer","Charlie Griffin","Joar Skalse"],"pdf_url":"https://arxiv.org/pdf/2310.09144v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09139v1","updated":"2023-10-13T14:27:21Z","published":"2023-10-13T14:27:21Z","title":"The Consensus Game: Language Model Generation via Equilibrium Search","summary":" When applied to question answering and other text generation tasks, language\nmodels (LMs) may be queried generatively (by sampling answers from their output\ndistribution) or discriminatively (by using them to score or rank a set of\ncandidate outputs). 
These procedures sometimes yield very different\npredictions. How do we reconcile mutually incompatible scoring procedures to\nobtain coherent LM predictions? We introduce a new, training-free,\ngame-theoretic procedure for language model decoding. Our approach casts\nlanguage model decoding as a regularized imperfect-information sequential\nsignaling game - which we term the CONSENSUS GAME - in which a GENERATOR seeks\nto communicate an abstract correctness parameter using natural language\nsentences to a DISCRIMINATOR. We develop computational procedures for finding\napproximate equilibria of this game, resulting in a decoding algorithm we call\nEQUILIBRIUM-RANKING. Applied to a large number of tasks (including reading\ncomprehension, commonsense reasoning, mathematical problem-solving, and\ndialog), EQUILIBRIUM-RANKING consistently, and sometimes substantially,\nimproves performance over existing LM decoding procedures - on multiple\nbenchmarks, we observe that applying EQUILIBRIUM-RANKING to LLaMA-7B\noutperforms the much larger LLaMA-65B and PaLM-540B models. These results\nhighlight the promise of game-theoretic tools for addressing fundamental\nchallenges of truthfulness and consistency in LMs.\n","authors":["Athul Paul Jacob","Yikang Shen","Gabriele Farina","Jacob Andreas"],"pdf_url":"https://arxiv.org/pdf/2310.09139v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07321v2","updated":"2023-10-13T14:24:31Z","published":"2023-10-11T09:09:55Z","title":"On the Impact of Cross-Domain Data on German Language Models","summary":" Traditionally, large language models have been either trained on general web\ncrawls or domain-specific data. However, recent successes of generative large\nlanguage models have shed light on the benefits of cross-domain datasets. To\nexamine the significance of prioritizing data diversity over quality, we\npresent a German dataset comprising texts from five domains, along with another\ndataset aimed at containing high-quality data. Through training a series of\nmodels ranging between 122M and 750M parameters on both datasets, we conduct a\ncomprehensive benchmark on multiple downstream tasks. Our findings demonstrate\nthat the models trained on the cross-domain dataset outperform those trained on\nquality data alone, leading to improvements of up to $4.45\\%$ over the previous\nstate-of-the-art. The models are available at\nhttps://huggingface.co/ikim-uk-essen\n","authors":["Amin Dada","Aokun Chen","Cheng Peng","Kaleb E Smith","Ahmad Idrissi-Yaghir","Constantin Marc Seibold","Jianning Li","Lars Heiliger","Xi Yang","Christoph M. Friedrich","Daniel Truhn","Jan Egger","Jiang Bian","Jens Kleesiek","Yonghui Wu"],"pdf_url":"https://arxiv.org/pdf/2310.07321v2.pdf","comment":"13 pages, 1 figure, accepted at Findings of the Association for\n Computational Linguistics: EMNLP 2023"},{"id":"http://arxiv.org/abs/2310.09129v1","updated":"2023-10-13T14:17:25Z","published":"2023-10-13T14:17:25Z","title":"Computing Marginal and Conditional Divergences between Decomposable\n Models with Applications","summary":" The ability to compute the exact divergence between two high-dimensional\ndistributions is useful in many applications but doing so naively is\nintractable. Computing the alpha-beta divergence -- a family of divergences\nthat includes the Kullback-Leibler divergence and Hellinger distance -- between\nthe joint distribution of two decomposable models, i.e., chordal Markov\nnetworks, can be done in time exponential in the treewidth of these models. 
However,\nreducing the dissimilarity between two high-dimensional objects to a single\nscalar value can be uninformative. Furthermore, in applications such as\nsupervised learning, the divergence over a conditional distribution might be of\nmore interest. Therefore, we propose an approach to compute the exact\nalpha-beta divergence between any marginal or conditional distribution of two\ndecomposable models. Doing so tractably is non-trivial as we need to decompose\nthe divergence between these distributions and therefore, require a\ndecomposition over the marginal and conditional distributions of these models.\nConsequently, we provide such a decomposition and also extend existing work to\ncompute the marginal and conditional alpha-beta divergence between these\ndecompositions. We then show how our method can be used to analyze\ndistributional changes by first applying it to a benchmark image dataset.\nFinally, based on our framework, we propose a novel way to quantify the error\nin contemporary superconducting quantum computers. Code for all experiments is\navailable at: https://lklee.dev/pub/2023-icdm/code\n","authors":["Loong Kuan Lee","Geoffrey I. Webb","Daniel F. Schmidt","Nico Piatkowski"],"pdf_url":"https://arxiv.org/pdf/2310.09129v1.pdf","comment":"10 pages, 8 figures, Accepted at the IEEE International Conference on\n Data Mining (ICDM) 2023"},{"id":"http://arxiv.org/abs/2310.09127v1","updated":"2023-10-13T14:15:54Z","published":"2023-10-13T14:15:54Z","title":"On Generalization Bounds for Projective Clustering","summary":" Given a set of points, clustering consists of finding a partition of a point\nset into $k$ clusters such that the center to which a point is assigned is as\nclose as possible. Most commonly, centers are points themselves, which leads to\nthe famous $k$-median and $k$-means objectives. One may also choose centers to\nbe $j$ dimensional subspaces, which gives rise to subspace clustering. In this\npaper, we consider learning bounds for these problems. That is, given a set of\n$n$ samples $P$ drawn independently from some unknown, but fixed distribution\n$\\mathcal{D}$, how quickly does a solution computed on $P$ converge to the\noptimal clustering of $\\mathcal{D}$? We give several near optimal results. In\nparticular,\n For center-based objectives, we show a convergence rate of\n$\\tilde{O}\\left(\\sqrt{{k}/{n}}\\right)$. This matches the known optimal bounds\nof [Fefferman, Mitter, and Narayanan, Journal of the Mathematical Society 2016]\nand [Bartlett, Linder, and Lugosi, IEEE Trans. Inf. Theory 1998] for $k$-means\nand extends it to other important objectives such as $k$-median.\n For subspace clustering with $j$-dimensional subspaces, we show a convergence\nrate of $\\tilde{O}\\left(\\sqrt{\\frac{kj^2}{n}}\\right)$. These are the first\nprovable bounds for most of these problems. 
For the specific case of projective\nclustering, which generalizes $k$-means, we show a convergence rate of\n$\\Omega\\left(\\sqrt{\\frac{kj}{n}}\\right)$ is necessary, thereby proving that the\nbounds from [Fefferman, Mitter, and Narayanan, Journal of the Mathematical\nSociety 2016] are essentially optimal.\n","authors":["Maria Sofia Bucarelli","Matilde Fjeldsø Larsen","Chris Schwiegelshohn","Mads Bech Toftrup"],"pdf_url":"https://arxiv.org/pdf/2310.09127v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09126v1","updated":"2023-10-13T14:14:43Z","published":"2023-10-13T14:14:43Z","title":"Physics-guided Noise Neural Proxy for Low-light Raw Image Denoising","summary":" Low-light raw image denoising plays a crucial role in mobile photography, and\nlearning-based methods have become the mainstream approach. Training the\nlearning-based methods with synthetic data emerges as an efficient and\npractical alternative to paired real data. However, the quality of synthetic\ndata is inherently limited by the low accuracy of the noise model, which\ndecreases the performance of low-light raw image denoising. In this paper, we\ndevelop a novel framework for accurate noise modeling that learns a\nphysics-guided noise neural proxy (PNNP) from dark frames. PNNP integrates\nthree efficient techniques: physics-guided noise decoupling (PND),\nphysics-guided proxy model (PPM), and differentiable distribution-oriented loss\n(DDL). The PND decouples the dark frame into different components and handles\ndifferent levels of noise in a flexible manner, which reduces the complexity of\nthe noise neural proxy. The PPM incorporates physical priors to effectively\nconstrain the generated noise, which promotes the accuracy of the noise neural\nproxy. The DDL provides explicit and reliable supervision for noise modeling,\nwhich promotes the precision of the noise neural proxy. Extensive experiments\non public low-light raw image denoising datasets and real low-light imaging\nscenarios demonstrate the superior performance of our PNNP framework.\n","authors":["Hansen Feng","Lizhi Wang","Yiqi Huang","Yuzhi Wang","Hua Huang"],"pdf_url":"https://arxiv.org/pdf/2310.09126v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09125v1","updated":"2023-10-13T14:14:00Z","published":"2023-10-13T14:14:00Z","title":"Training and Predicting Visual Error for Real-Time Applications","summary":" Visual error metrics play a fundamental role in the quantification of\nperceived image similarity. Most recently, use cases for them in real-time\napplications have emerged, such as content-adaptive shading and shading reuse\nto increase performance and improve efficiency. A wide range of different\nmetrics has been established, with the most sophisticated being capable of\ncapturing the perceptual characteristics of the human visual system. However,\ntheir complexity, computational expense, and reliance on reference images to\ncompare against prevent their generalized use in real-time, restricting such\napplications to using only the simplest available metrics. In this work, we\nexplore the abilities of convolutional neural networks to predict a variety of\nvisual metrics without requiring either reference or rendered images.\nSpecifically, we train and deploy a neural network to estimate the visual error\nresulting from reusing shading or using reduced shading rates. The resulting\nmodels account for 70%-90% of the variance while achieving up to an order of\nmagnitude faster computation times. 
Our solution combines image-space\ninformation that is readily available in most state-of-the-art deferred shading\npipelines with reprojection from previous frames to enable an adequate estimate\nof visual errors, even in previously unseen regions. We describe a suitable\nconvolutional network architecture and considerations for data preparation for\ntraining. We demonstrate the capability of our network to predict complex error\nmetrics at interactive rates in a real-time application that implements\ncontent-adaptive shading in a deferred pipeline. Depending on the portion of\nunseen image regions, our approach can achieve up to $2\\times$ performance\ncompared to state-of-the-art methods.\n","authors":["João Libório Cardoso","Bernhard Kerbl","Lei Yang","Yury Uralsky","Michael Wimmer"],"pdf_url":"https://arxiv.org/pdf/2310.09125v1.pdf","comment":"Published at Proceedings of the ACM in Computer Graphics and\n Interactive Techniques. 14 Pages, 16 Figures, 3 Tables. For paper website and\n higher quality figures, see https://jaliborc.github.io/rt-percept/"},{"id":"http://arxiv.org/abs/2310.09123v1","updated":"2023-10-13T14:13:02Z","published":"2023-10-13T14:13:02Z","title":"Automatic Music Playlist Generation via Simulation-based Reinforcement\n Learning","summary":" Personalization of playlists is a common feature in music streaming services,\nbut conventional techniques, such as collaborative filtering, rely on explicit\nassumptions regarding content quality to learn how to make recommendations.\nSuch assumptions often result in misalignment between offline model objectives\nand online user satisfaction metrics. In this paper, we present a reinforcement\nlearning framework that solves for such limitations by directly optimizing for\nuser satisfaction metrics via the use of a simulated playlist-generation\nenvironment. Using this simulator we develop and train a modified Deep\nQ-Network, the action head DQN (AH-DQN), in a manner that addresses the\nchallenges imposed by the large state and action space of our RL formulation.\nThe resulting policy is capable of making recommendations from large and\ndynamic sets of candidate items with the expectation of maximizing consumption\nmetrics. We analyze and evaluate agents offline via simulations that use\nenvironment models trained on both public and proprietary streaming datasets.\nWe show how these agents lead to better user-satisfaction metrics compared to\nbaseline methods during online A/B tests. Finally, we demonstrate that\nperformance assessments produced from our simulator are strongly correlated\nwith observed online metric results.\n","authors":["Federico Tomasi","Joseph Cauteruccio","Surya Kanoria","Kamil Ciosek","Matteo Rinaldi","Zhenwen Dai"],"pdf_url":"https://arxiv.org/pdf/2310.09123v1.pdf","comment":"10 pages. KDD 23"},{"id":"http://arxiv.org/abs/2310.09118v1","updated":"2023-10-13T14:03:01Z","published":"2023-10-13T14:03:01Z","title":"DSG: An End-to-End Document Structure Generator","summary":" Information in industry, research, and the public sector is widely stored as\nrendered documents (e.g., PDF files, scans). Hence, to enable downstream tasks,\nsystems are needed that map rendered documents onto a structured hierarchical\nformat. However, existing systems for this task are limited by heuristics and\nare not end-to-end trainable. In this work, we introduce the Document Structure\nGenerator (DSG), a novel system for document parsing that is fully end-to-end\ntrainable. 
DSG combines a deep neural network for parsing (i) entities in\ndocuments (e.g., figures, text blocks, headers, etc.) and (ii) relations that\ncapture the sequence and nested structure between entities. Unlike existing\nsystems that rely on heuristics, our DSG is trained end-to-end, making it\neffective and flexible for real-world applications. We further contribute a\nnew, large-scale dataset called E-Periodica comprising real-world magazines\nwith complex document structures for evaluation. Our results demonstrate that\nour DSG outperforms commercial OCR tools and, on top of that, achieves\nstate-of-the-art performance. To the best of our knowledge, our DSG system is\nthe first end-to-end trainable system for hierarchical document parsing.\n","authors":["Johannes Rausch","Gentiana Rashiti","Maxim Gusev","Ce Zhang","Stefan Feuerriegel"],"pdf_url":"https://arxiv.org/pdf/2310.09118v1.pdf","comment":"Accepted at ICDM 2023"},{"id":"http://arxiv.org/abs/2306.09675v3","updated":"2023-10-13T13:48:58Z","published":"2023-06-16T08:13:41Z","title":"Multi-View Class Incremental Learning","summary":" Multi-view learning (MVL) has gained great success in integrating information\nfrom multiple perspectives of a dataset to improve downstream task performance.\nTo make MVL methods more practical in an open-ended environment, this paper\ninvestigates a novel paradigm called multi-view class incremental learning\n(MVCIL), where a single model incrementally classifies new classes from a\ncontinual stream of views, requiring no access to earlier views of data.\nHowever, MVCIL is challenged by the catastrophic forgetting of old information\nand the interference with learning new concepts. To address this, we first\ndevelop a randomization-based representation learning technique serving for\nfeature extraction to guarantee their separate view-optimal working states,\nduring which multiple views belonging to a class are presented sequentially;\nThen, we integrate them one by one in the orthogonality fusion subspace spanned\nby the extracted features; Finally, we introduce selective weight consolidation\nfor learning-without-forgetting decision-making while encountering new classes.\nExtensive experiments on synthetic and real-world datasets validate the\neffectiveness of our approach.\n","authors":["Depeng Li","Tianqi Wang","Junwei Chen","Kenji Kawaguchi","Cheng Lian","Zhigang Zeng"],"pdf_url":"https://arxiv.org/pdf/2306.09675v3.pdf","comment":"Accepted to Information Fusion"},{"id":"http://arxiv.org/abs/2309.03004v3","updated":"2023-10-13T13:34:45Z","published":"2023-09-06T13:48:40Z","title":"A Theoretical Explanation of Activation Sparsity through Flat Minima and\n Adversarial Robustness","summary":" A recent empirical observation (Li et al., 2022b) of activation sparsity in\nMLP blocks offers an opportunity to drastically reduce computation costs for\nfree. Although it has been attributed to training dynamics, existing theoretical\nexplanations of activation sparsity are restricted to shallow networks, small\ntraining steps and special training, despite its emergence in deep models\nstandardly trained for a large number of steps. To fill these gaps, we propose\nthe notion of gradient sparsity as one source of activation sparsity and a\ntheoretical explanation based on it that sees sparsity as a necessary step to\nadversarial robustness w.r.t. hidden features and parameters, which is\napproximately the flatness of minima for well-learned models. 
The theory\napplies to standardly trained LayerNorm-ed MLPs, and further to Transformers or\nother architectures trained with weight noises. Eliminating other sources of\nflatness except for sparsity, we discover the phenomenon that the ratio between\nthe largest and smallest non-zero singular values of weight matrices is small.\nWhen discussing the emergence of this spectral concentration, we use random\nmatrix theory (RMT) as a powerful tool to analyze stochastic gradient noises.\nValidation experiments are conducted to verify our gradient-sparsity-based\nexplanation. We propose two plug-and-play modules for both training and\nfinetuning for sparsity. Experiments on ImageNet-1k and C4 demonstrate their\n50% sparsity improvements, indicating further potential cost reduction in both\ntraining and inference.\n","authors":["Ze Peng","Lei Qi","Yinghuan Shi","Yang Gao"],"pdf_url":"https://arxiv.org/pdf/2309.03004v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09091v1","updated":"2023-10-13T13:22:05Z","published":"2023-10-13T13:22:05Z","title":"Insightful analysis of historical sources at scales beyond human\n capabilities using unsupervised Machine Learning and XAI","summary":" Historical materials are abundant. Yet, piecing together how human knowledge\nhas evolved and spread both diachronically and synchronically remains a\nchallenge that can so far only be very selectively addressed. The vast volume\nof materials precludes comprehensive studies, given the restricted number of\nhuman specialists. However, as large amounts of historical materials are now\navailable in digital form there is a promising opportunity for AI-assisted\nhistorical analysis. In this work, we take a pivotal step towards analyzing\nvast historical corpora by employing innovative machine learning (ML)\ntechniques, enabling in-depth historical insights on a grand scale. Our study\ncenters on the evolution of knowledge within the `Sacrobosco Collection' -- a\ndigitized collection of 359 early modern printed editions of textbooks on\nastronomy used at European universities between 1472 and 1650 -- roughly 76,000\npages, many of which contain astronomic, computational tables. An ML based\nanalysis of these tables helps to unveil important facets of the\nspatio-temporal evolution of knowledge and innovation in the field of\nmathematical astronomy in the period, as taught at European universities.\n","authors":["Oliver Eberle","Jochen Büttner","Hassan El-Hajj","Grégoire Montavon","Klaus-Robert Müller","Matteo Valleriani"],"pdf_url":"https://arxiv.org/pdf/2310.09091v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2107.06755v2","updated":"2023-10-13T13:19:31Z","published":"2021-07-14T15:09:27Z","title":"DIT4BEARs Smart Roads Internship","summary":" The research internship at UiT - The Arctic University of Norway was offered\nto our team for winning the 'Smart Roads - Winter Road Maintenance\n2021' Hackathon. The internship commenced on 3 May 2021 and ended on 21 May\n2021 with meetings happening twice each week. Despite having different\nnationalities and educational backgrounds, the two of us tried to collaborate\nas a team as much as possible. The most compelling part was that working on this\nproject made us realize the critical conditions faced by the Arctic people, an\nexperience that would have been hard to gain from our places of residence. We\ndeveloped and implemented several deep learning models to classify the states\n(dry, moist, wet, icy, snowy, slushy). 
Depending upon the best model, the\nweather forecast app will predict the state taking the Ta, Tsurf, Height,\nSpeed, Water, etc. into consideration. The crucial part was to define a safety\nmetric which is the product of the accident rates based on friction and the\naccident rates based on states. We developed a regressor that will predict the\nsafety metric depending upon the state obtained from the classifier and the\nfriction obtained from the sensor data. A pathfinding algorithm has been\ndesigned using the sensor data, open street map data, weather data.\n","authors":["Md Abrar Jahin","Andrii Krutsylo"],"pdf_url":"https://arxiv.org/pdf/2107.06755v2.pdf","comment":"6 pages"},{"id":"http://arxiv.org/abs/2306.10474v2","updated":"2023-10-13T13:05:26Z","published":"2023-06-18T04:34:17Z","title":"A Universal Semantic-Geometric Representation for Robotic Manipulation","summary":" Robots rely heavily on sensors, especially RGB and depth cameras, to perceive\nand interact with the world. RGB cameras record 2D images with rich semantic\ninformation while missing precise spatial information. On the other side, depth\ncameras offer critical 3D geometry data but capture limited semantics.\nTherefore, integrating both modalities is crucial for learning representations\nfor robotic perception and control. However, current research predominantly\nfocuses on only one of these modalities, neglecting the benefits of\nincorporating both. To this end, we present $\\textbf{Semantic-Geometric\nRepresentation} (\\textbf{SGR})$, a universal perception module for robotics\nthat leverages the rich semantic information of large-scale pre-trained 2D\nmodels and inherits the merits of 3D spatial reasoning. Our experiments\ndemonstrate that SGR empowers the agent to successfully complete a diverse\nrange of simulated and real-world robotic manipulation tasks, outperforming\nstate-of-the-art methods significantly in both single-task and multi-task\nsettings. Furthermore, SGR possesses the capability to generalize to novel\nsemantic attributes, setting it apart from the other methods. Project website:\nhttps://semantic-geometric-representation.github.io.\n","authors":["Tong Zhang","Yingdong Hu","Hanchen Cui","Hang Zhao","Yang Gao"],"pdf_url":"https://arxiv.org/pdf/2306.10474v2.pdf","comment":"CoRL 2023. Project website:\n https://semantic-geometric-representation.github.io"},{"id":"http://arxiv.org/abs/2303.15833v2","updated":"2023-10-13T12:49:35Z","published":"2023-03-28T09:05:15Z","title":"Complementary Domain Adaptation and Generalization for Unsupervised\n Continual Domain Shift Learning","summary":" Continual domain shift poses a significant challenge in real-world\napplications, particularly in situations where labeled data is not available\nfor new domains. The challenge of acquiring knowledge in this problem setting\nis referred to as unsupervised continual domain shift learning. Existing\nmethods for domain adaptation and generalization have limitations in addressing\nthis issue, as they focus either on adapting to a specific domain or\ngeneralizing to unseen domains, but not both. In this paper, we propose\nComplementary Domain Adaptation and Generalization (CoDAG), a simple yet\neffective learning framework that combines domain adaptation and generalization\nin a complementary manner to achieve three major goals of unsupervised\ncontinual domain shift learning: adapting to a current domain, generalizing to\nunseen domains, and preventing forgetting of previously seen domains. 
Our\napproach is model-agnostic, meaning that it is compatible with any existing\ndomain adaptation and generalization algorithms. We evaluate CoDAG on several\nbenchmark datasets and demonstrate that our model outperforms state-of-the-art\nmodels in all datasets and evaluation metrics, highlighting its effectiveness\nand robustness in handling unsupervised continual domain shift learning.\n","authors":["Wonguk Cho","Jinha Park","Taesup Kim"],"pdf_url":"https://arxiv.org/pdf/2303.15833v2.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2310.09071v1","updated":"2023-10-13T12:45:52Z","published":"2023-10-13T12:45:52Z","title":"Online Relocating and Matching of Ride-Hailing Services: A Model-Based\n Modular Approach","summary":" This study proposes an innovative model-based modular approach (MMA) to\ndynamically optimize order matching and vehicle relocation in a ride-hailing\nplatform. MMA utilizes a two-layer and modular modeling structure. The upper\nlayer determines the spatial transfer patterns of vehicle flow within the\nsystem to maximize the total revenue of the current and future stages. With the\nguidance provided by the upper layer, the lower layer performs rapid\nvehicle-to-order matching and vehicle relocation. MMA is interpretable, and\nequipped with the customized and polynomial-time algorithm, which, as an online\norder-matching and vehicle-relocation algorithm, can scale past thousands of\nvehicles. We theoretically prove that the proposed algorithm can achieve the\nglobal optimum in stylized networks, while the numerical experiments based on\nboth the toy network and realistic dataset demonstrate that MMA is capable of\nachieving superior systematic performance compared to batch matching and\nreinforcement-learning based methods. Moreover, its modular and lightweight\nmodeling structure further enables it to achieve a high level of robustness\nagainst demand variation while maintaining a relatively low computational cost.\n","authors":["Chang Gao","Xi Lin","Fang He","Xindi Tang"],"pdf_url":"https://arxiv.org/pdf/2310.09071v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09044v1","updated":"2023-10-13T12:12:34Z","published":"2023-10-13T12:12:34Z","title":"KCTS: Knowledge-Constrained Tree Search Decoding with Token-Level\n Hallucination Detection","summary":" Large Language Models (LLMs) have demonstrated remarkable human-level natural\nlanguage generation capabilities. However, their potential to generate\nmisinformation, often called the hallucination problem, poses a significant\nrisk to their deployment. A common approach to address this issue is to\nretrieve relevant knowledge and fine-tune the LLM with the knowledge in its\ninput. Unfortunately, this method incurs high training costs and may cause\ncatastrophic forgetting for multi-tasking models. To overcome these\nlimitations, we propose a knowledge-constrained decoding method called KCTS\n(Knowledge-Constrained Tree Search), which guides a frozen LM to generate text\naligned with the reference knowledge at each decoding step using a knowledge\nclassifier score and MCTS (Monte-Carlo Tree Search). To adapt the\nsequence-level knowledge classifier to token-level guidance, we also propose a\nnovel token-level hallucination detection method called RIPA (Reward Inflection\nPoint Approximation). 
Our empirical results on knowledge-grounded dialogue and\nabstractive summarization demonstrate the strength of KCTS as a plug-and-play,\nmodel-agnostic decoding method that can effectively reduce hallucinations in\nnatural language generation.\n","authors":["Sehyun Choi","Tianqing Fang","Zhaowei Wang","Yangqiu Song"],"pdf_url":"https://arxiv.org/pdf/2310.09044v1.pdf","comment":"Accepted at EMNLP 2023 Main Conference"},{"id":"http://arxiv.org/abs/2207.09768v3","updated":"2023-10-13T12:09:26Z","published":"2022-07-20T09:23:35Z","title":"Learning Counterfactually Invariant Predictors","summary":" Notions of counterfactual invariance (CI) have proven essential for\npredictors that are fair, robust, and generalizable in the real world. We\npropose graphical criteria that yield a sufficient condition for a predictor to\nbe counterfactually invariant in terms of a conditional independence in the\nobservational distribution. In order to learn such predictors, we propose a\nmodel-agnostic framework, called Counterfactually Invariant Prediction (CIP),\nbuilding on the Hilbert-Schmidt Conditional Independence Criterion (HSCIC), a\nkernel-based conditional dependence measure. Our experimental results\ndemonstrate the effectiveness of CIP in enforcing counterfactual invariance\nacross various simulated and real-world datasets including scalar and\nmulti-variate settings.\n","authors":["Francesco Quinzan","Cecilia Casolo","Krikamol Muandet","Yucen Luo","Niki Kilbertus"],"pdf_url":"https://arxiv.org/pdf/2207.09768v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09040v1","updated":"2023-10-13T12:07:36Z","published":"2023-10-13T12:07:36Z","title":"Optimal Scheduling of Electric Vehicle Charging with Deep Reinforcement\n Learning considering End Users Flexibility","summary":" The rapid growth of decentralized energy resources and especially Electric\nVehicles (EV), that are expected to increase sharply over the next decade, will\nput further stress on existing power distribution networks, increasing the need\nfor higher system reliability and flexibility. In an attempt to avoid\nunnecessary network investments and to increase the controllability over\ndistribution networks, network operators develop demand response (DR) programs\nthat incentivize end users to shift their consumption in return for financial\nor other benefits. Artificial intelligence (AI) methods are in the research\nforefront for residential load scheduling applications, mainly due to their\nhigh accuracy, high computational speed and lower dependence on the physical\ncharacteristics of the models under development. The aim of this work is to\nidentify households' EV cost-reducing charging policy under a Time-of-Use\ntariff scheme, with the use of Deep Reinforcement Learning, and more\nspecifically Deep Q-Networks (DQN). A novel end users flexibility potential\nreward is inferred from historical data analysis, where households with solar\npower generation have been used to train and test the designed algorithm. The\nsuggested DQN EV charging policy can lead to more than 20% of savings in end\nusers electricity bills.\n","authors":["Christoforos Menos-Aikateriniadis","Stavros Sykiotis","Pavlos S. 
Georgilakis"],"pdf_url":"https://arxiv.org/pdf/2310.09040v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09031v1","updated":"2023-10-13T11:47:41Z","published":"2023-10-13T11:47:41Z","title":"MINDE: Mutual Information Neural Diffusion Estimation","summary":" In this work we present a new method for the estimation of Mutual Information\n(MI) between random variables. Our approach is based on an original\ninterpretation of the Girsanov theorem, which allows us to use score-based\ndiffusion models to estimate the Kullback Leibler divergence between two\ndensities as a difference between their score functions. As a by-product, our\nmethod also enables the estimation of the entropy of random variables. Armed\nwith such building blocks, we present a general recipe to measure MI, which\nunfolds in two directions: one uses conditional diffusion process, whereas the\nother uses joint diffusion processes that allow simultaneous modelling of two\nrandom variables. Our results, which derive from a thorough experimental\nprotocol over all the variants of our approach, indicate that our method is\nmore accurate than the main alternatives from the literature, especially for\nchallenging distributions. Furthermore, our methods pass MI self-consistency\ntests, including data processing and additivity under independence, which\ninstead are a pain-point of existing methods.\n","authors":["Giulio Franzese","Mustapha Bounoua","Pietro Michiardi"],"pdf_url":"https://arxiv.org/pdf/2310.09031v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09028v1","updated":"2023-10-13T11:40:18Z","published":"2023-10-13T11:40:18Z","title":"Subspace Adaptation Prior for Few-Shot Learning","summary":" Gradient-based meta-learning techniques aim to distill useful prior knowledge\nfrom a set of training tasks such that new tasks can be learned more\nefficiently with gradient descent. While these methods have achieved successes\nin various scenarios, they commonly adapt all parameters of trainable layers\nwhen learning new tasks. This neglects potentially more efficient learning\nstrategies for a given task distribution and may be susceptible to overfitting,\nespecially in few-shot learning where tasks must be learned from a limited\nnumber of examples. To address these issues, we propose Subspace Adaptation\nPrior (SAP), a novel gradient-based meta-learning algorithm that jointly learns\ngood initialization parameters (prior knowledge) and layer-wise parameter\nsubspaces in the form of operation subsets that should be adaptable. In this\nway, SAP can learn which operation subsets to adjust with gradient descent\nbased on the underlying task distribution, simultaneously decreasing the risk\nof overfitting when learning new tasks. We demonstrate that this ability is\nhelpful as SAP yields superior or competitive performance in few-shot image\nclassification settings (gains between 0.1% and 3.9% in accuracy). Analysis of\nthe learned subspaces demonstrates that low-dimensional operations often yield\nhigh activation strengths, indicating that they may be important for achieving\ngood few-shot learning performance. For reproducibility purposes, we publish\nall our research code publicly.\n","authors":["Mike Huisman","Aske Plaat","Jan N. 
van Rijn"],"pdf_url":"https://arxiv.org/pdf/2310.09028v1.pdf","comment":"Accepted at Machine Learning Journal, Special Issue of the ECML PKDD\n 2023 Journal Track"},{"id":"http://arxiv.org/abs/2302.06887v3","updated":"2023-10-13T11:08:19Z","published":"2023-02-14T08:20:41Z","title":"Learning Graph ARMA Processes from Time-Vertex Spectra","summary":" The modeling of time-varying graph signals as stationary time-vertex\nstochastic processes permits the inference of missing signal values by\nefficiently employing the correlation patterns of the process across different\ngraph nodes and time instants. In this study, we propose an algorithm for\ncomputing graph autoregressive moving average (graph ARMA) processes based on\nlearning the joint time-vertex power spectral density of the process from its\nincomplete realizations for the task of signal interpolation. Our solution\nrelies on first roughly estimating the joint spectrum of the process from\npartially observed realizations and then refining this estimate by projecting\nit onto the spectrum manifold of the graph ARMA process through convex\nrelaxations. The initially missing signal values are then estimated based on\nthe learnt model. Experimental results show that the proposed approach achieves\nhigh accuracy in time-vertex signal estimation problems.\n","authors":["Eylem Tugce Guneyi","Berkay Yaldiz","Abdullah Canbolat","Elif Vural"],"pdf_url":"https://arxiv.org/pdf/2302.06887v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09002v1","updated":"2023-10-13T10:48:28Z","published":"2023-10-13T10:48:28Z","title":"Federated Meta-Learning for Few-Shot Fault Diagnosis with Representation\n Encoding","summary":" Deep learning-based fault diagnosis (FD) approaches require a large amount of\ntraining data, which are difficult to obtain since they are located across\ndifferent entities. Federated learning (FL) enables multiple clients to\ncollaboratively train a shared model with data privacy guaranteed. However, the\ndomain discrepancy and data scarcity problems among clients deteriorate the\nperformance of the global FL model. To tackle these issues, we propose a novel\nframework called representation encoding-based federated meta-learning (REFML)\nfor few-shot FD. First, a novel training strategy based on representation\nencoding and meta-learning is developed. It harnesses the inherent\nheterogeneity among training clients, effectively transforming it into an\nadvantage for out-of-distribution generalization on unseen working conditions\nor equipment types. Additionally, an adaptive interpolation method that\ncalculates the optimal combination of local and global models as the\ninitialization of local training is proposed. This helps to further utilize\nlocal information to mitigate the negative effects of domain discrepancy. As a\nresult, high diagnostic accuracy can be achieved on unseen working conditions\nor equipment types with limited training data. 
Compared with the\nstate-of-the-art methods, such as FedProx, the proposed REFML framework\nachieves an increase in accuracy by 2.17%-6.50% when tested on unseen working\nconditions of the same equipment type and 13.44%-18.33% when tested on totally\nunseen equipment types, respectively.\n","authors":["Jixuan Cui","Jun Li","Zhen Mei","Kang Wei","Sha Wei","Ming Ding","Wen Chen","Song Guo"],"pdf_url":"https://arxiv.org/pdf/2310.09002v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09000v1","updated":"2023-10-13T10:37:46Z","published":"2023-10-13T10:37:46Z","title":"Measuring the Stability of Process Outcome Predictions in Online\n Settings","summary":" Predictive Process Monitoring aims to forecast the future progress of process\ninstances using historical event data. As predictive process monitoring is\nincreasingly applied in online settings to enable timely interventions,\nevaluating the performance of the underlying models becomes crucial for\nensuring their consistency and reliability over time. This is especially\nimportant in high risk business scenarios where incorrect predictions may have\nsevere consequences. However, predictive models are currently usually evaluated\nusing a single, aggregated value or a time-series visualization, which makes it\nchallenging to assess their performance and, specifically, their stability over\ntime. This paper proposes an evaluation framework for assessing the stability\nof models for online predictive process monitoring. The framework introduces\nfour performance meta-measures: the frequency of significant performance drops,\nthe magnitude of such drops, the recovery rate, and the volatility of\nperformance. To validate this framework, we applied it to two artificial and\ntwo real-world event logs. The results demonstrate that these meta-measures\nfacilitate the comparison and selection of predictive models for different\nrisk-taking scenarios. Such insights are of particular value to enhance\ndecision-making in dynamic business environments.\n","authors":["Suhwan Lee","Marco Comuzzi","Xixi Lu","Hajo A. Reijers"],"pdf_url":"https://arxiv.org/pdf/2310.09000v1.pdf","comment":"8 pages, 3 figures, Proceedings of the 5th International Conference\n on Process Mining (ICPM 2023)"},{"id":"http://arxiv.org/abs/2310.08988v1","updated":"2023-10-13T10:09:12Z","published":"2023-10-13T10:09:12Z","title":"Reroute Prediction Service","summary":" The cost of delays was estimated as 33 billion US dollars only in 2019 for\nthe US National Airspace System, a peak value following a growth trend in past\nyears. Aiming to address this huge inefficiency, we designed and developed a\nnovel Data Analytics and Machine Learning system, which aims at reducing delays\nby proactively supporting re-routing decisions.\n Given a time interval up to a few days in the future, the system predicts if\na reroute advisory for a certain Air Route Traffic Control Center or for a\ncertain advisory identifier will be issued, which may impact the pertinent\nroutes. To deliver such predictions, the system uses historical reroute data,\ncollected from the System Wide Information Management (SWIM) data services\nprovided by the FAA, and weather data, provided by the US National Centers for\nEnvironmental Prediction (NCEP). The data is huge in volume, and has many items\nstreamed at high velocity, uncorrelated and noisy. 
The system continuously\nprocesses the incoming raw data and makes it available for the next step where\nan interim data store is created and adaptively maintained for efficient query\nprocessing. The resulting data is fed into an array of ML algorithms, which\ncompete for higher accuracy. The best performing algorithm is used in the final\nprediction, generating the final results. Mean accuracy values higher than 90%\nwere obtained in our experiments with this system.\n Our algorithm divides the area of interest in units of aggregation and uses\ntemporal series of the aggregate measures of weather forecast parameters in\neach geographical unit, in order to detect correlations with reroutes and where\nthey will most likely occur. Aiming at practical application, the system is\nformed by a number of microservices, which are deployed in the cloud, making\nthe system distributed, scalable and highly available.\n","authors":["Ítalo Romani de Oliveira","Samet Ayhan","Michael Biglin","Pablo Costas","Euclides C. Pinto Neto"],"pdf_url":"https://arxiv.org/pdf/2310.08988v1.pdf","comment":"Submitted to the 2023 IEEE/AIAA Digital Aviation Systems Conference\n (DASC)"},{"id":"http://arxiv.org/abs/2308.09259v2","updated":"2023-10-13T09:41:45Z","published":"2023-08-18T02:34:37Z","title":"FRGNN: Mitigating the Impact of Distribution Shift on Graph Neural\n Networks via Test-Time Feature Reconstruction","summary":" Due to inappropriate sample selection and limited training data, a\ndistribution shift often exists between the training and test sets. This shift\ncan adversely affect the test performance of Graph Neural Networks (GNNs).\nExisting approaches mitigate this issue by either enhancing the robustness of\nGNNs to distribution shift or reducing the shift itself. However, both\napproaches necessitate retraining the model, which becomes unfeasible when the\nmodel structure and parameters are inaccessible. To address this challenge, we\npropose FR-GNN, a general framework for GNNs to conduct feature reconstruction.\nFRGNN constructs a mapping relationship between the output and input of a\nwell-trained GNN to obtain class representative embeddings and then uses these\nembeddings to reconstruct the features of labeled nodes. These reconstructed\nfeatures are then incorporated into the message passing mechanism of GNNs to\ninfluence the predictions of unlabeled nodes at test time. Notably, the\nreconstructed node features can be directly utilized for testing the\nwell-trained model, effectively reducing the distribution shift and leading to\nimproved test performance. This remarkable achievement is attained without any\nmodifications to the model structure or parameters. We provide theoretical\nguarantees for the effectiveness of our framework. Furthermore, we conduct\ncomprehensive experiments on various public datasets. 
The experimental results\ndemonstrate the superior performance of FRGNN in comparison to multiple\ncategories of baseline methods.\n","authors":["Rui Ding","Jielong Yang","Feng Ji","Xionghu Zhong","Linbo Xie"],"pdf_url":"https://arxiv.org/pdf/2308.09259v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.16978v2","updated":"2023-10-13T09:37:06Z","published":"2023-06-29T14:32:06Z","title":"Learning Coverage Paths in Unknown Environments with Reinforcement\n Learning","summary":" Coverage path planning (CPP) is the problem of finding a path that covers the\nentire free space of a confined area, with applications ranging from robotic\nlawn mowing and vacuum cleaning, to demining and search-and-rescue tasks. While\noffline methods can find provably complete, and in some cases optimal, paths\nfor known environments, their value is limited in online scenarios where the\nenvironment is not known beforehand. In this case, the path needs to be planned\nonline while mapping the environment. We investigate how suitable reinforcement\nlearning is for this challenging problem, and analyze the involved components\nrequired to efficiently learn coverage paths, such as action space, input\nfeature representation, neural network architecture, and reward function.\nCompared to existing classical methods, this approach allows for a flexible\npath space, and enables the agent to adapt to specific environment dynamics. In\naddition to local sensory inputs for acting on short-term obstacle detections,\nwe propose to use egocentric maps in multiple scales based on frontiers. This\nallows the agent to plan a long-term path in large-scale environments with\nfeasible computational and memory complexity. Furthermore, we propose a novel\ntotal variation reward term for guiding the agent not to leave small holes of\nnon-covered free space. To validate the effectiveness of our approach, we\nperform extensive experiments in simulation with a 2D ranging sensor on\ndifferent variations of the CPP problem, surpassing the performance of both\nprevious RL-based approaches and highly specialized methods.\n","authors":["Arvi Jonnarth","Jie Zhao","Michael Felsberg"],"pdf_url":"https://arxiv.org/pdf/2306.16978v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.08627v2","updated":"2023-10-13T09:12:03Z","published":"2023-03-15T13:54:11Z","title":"From Images to Features: Unbiased Morphology Classification via\n Variational Auto-Encoders and Domain Adaptation","summary":" We present a novel approach for the dimensionality reduction of galaxy images\nby leveraging a combination of variational auto-encoders (VAE) and domain\nadaptation (DA). We demonstrate the effectiveness of this approach using a\nsample of low redshift galaxies with detailed morphological type labels from\nthe Galaxy-Zoo DECaLS project. We show that 40-dimensional latent variables can\neffectively reproduce most morphological features in galaxy images. To further\nvalidate the effectiveness of our approach, we utilised a classical random\nforest (RF) classifier on the 40-dimensional latent variables to make detailed\nmorphology feature classifications. This approach performs similarly to a\ndirect neural network application on galaxy images. We further enhance our\nmodel by tuning the VAE network via DA using galaxies in the overlapping\nfootprint of DECaLS and BASS+MzLS, enabling the unbiased application of our\nmodel to galaxy images in both surveys. We observed that DA led to even better\nmorphological feature extraction and classification performance. 
Overall, this\ncombination of VAE and DA can be applied to achieve image dimensionality\nreduction, defect image identification, and morphology classification in large\noptical surveys.\n","authors":["Quanfeng Xu","Shiyin Shen","Rafael S. de Souza","Mi Chen","Renhao Ye","Yumei She","Zhu Chen","Emille E. O. Ishida","Alberto Krone-Martins","Rupesh Durgesh"],"pdf_url":"https://arxiv.org/pdf/2303.08627v2.pdf","comment":"Accepted by MNRAS 2023 October 12. 10 pages, 8 figures"},{"id":"http://arxiv.org/abs/2310.08961v1","updated":"2023-10-13T09:11:35Z","published":"2023-10-13T09:11:35Z","title":"PAGE: Equilibrate Personalization and Generalization in Federated\n Learning","summary":" Federated learning (FL) is becoming a major driving force behind machine\nlearning as a service, where customers (clients) collaboratively benefit from\nshared local updates under the orchestration of the service provider (server).\nRepresenting clients' current demands and the server's future demand, local\nmodel personalization and global model generalization are separately\ninvestigated, as the ill-effects of data heterogeneity enforce the community to\nfocus on one over the other. However, these two seemingly competing goals are\nof equal importance rather than black and white issues, and should be achieved\nsimultaneously. In this paper, we propose the first algorithm to balance\npersonalization and generalization on top of game theory, dubbed PAGE, which\nreshapes FL as a co-opetition game between clients and the server. To explore\nthe equilibrium, PAGE further formulates the game as Markov decision processes,\nand leverages the reinforcement learning algorithm, which simplifies the\nsolving complexity. Extensive experiments on four widespread datasets show that\nPAGE outperforms state-of-the-art FL baselines in terms of global and local\nprediction accuracy simultaneously, and the accuracy can be improved by up to\n35.20% and 39.91%, respectively. In addition, biased variants of PAGE imply\npromising adaptiveness to demand shifts in practice.\n","authors":["Qian Chen","Zilong Wang","Jiaqi Hu","Haonan Yan","Jianying Zhou","Xiaodong Lin"],"pdf_url":"https://arxiv.org/pdf/2310.08961v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.09323v2","updated":"2023-10-13T08:58:46Z","published":"2023-09-17T17:01:05Z","title":"Answering Layer 3 queries with DiscoSCMs","summary":" Addressing causal queries across the Pearl Causal Hierarchy (PCH) (i.e.,\nassociational, interventional and counterfactual), which is formalized as\n\\Layer{} Valuations, is a central task in contemporary causal inference\nresearch. Counterfactual questions, in particular, pose a significant challenge\nas they often necessitate a complete knowledge of structural equations. This\npaper identifies \\textbf{the degeneracy problem} caused by the consistency\nrule. To tackle this, the \\textit{Distribution-consistency Structural Causal\nModels} (DiscoSCMs) is introduced, which extends both the structural causal\nmodels (SCM) and the potential outcome framework. The correlation pattern of\npotential outcomes in personalized incentive scenarios, described by $P(y_x,\ny'_{x'})$, is used as a case study for elucidation. Although counterfactuals\nare no longer degenerate, they remain indeterminable. As a result, the\ncondition of independent potential noise is incorporated into DiscoSCM. It is\nfound that by adeptly using homogeneity, counterfactuals can be identified.\nFurthermore, more refined results are achieved in the unit problem scenario. 
In\nsimpler terms, when modeling counterfactuals, one should contemplate: \"Consider\na person with average ability who takes a test and, due to good luck, achieves\nan exceptionally high score. If this person were to retake the test under\nidentical external conditions, what score would he obtain? An exceptionally high\nscore or an average score?\" If your choice is to predict an average score, then\nyou are essentially choosing DiscoSCM over the traditional frameworks based on\nthe consistency rule.\n","authors":["Heyang Gong"],"pdf_url":"https://arxiv.org/pdf/2309.09323v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.00527v2","updated":"2023-10-13T08:58:34Z","published":"2023-07-02T09:38:43Z","title":"Graph Neural Networks based Log Anomaly Detection and Explanation","summary":" Event logs are widely used to record the status of high-tech systems, making\nlog anomaly detection important for monitoring those systems. Most existing log\nanomaly detection methods take a log event count matrix or log event sequences\nas input, exploiting quantitative and/or sequential relationships between log\nevents to detect anomalies. Unfortunately, only considering quantitative or\nsequential relationships may result in low detection accuracy. To alleviate\nthis problem, we propose a graph-based method for unsupervised log anomaly\ndetection, dubbed Logs2Graphs, which first converts event logs into attributed,\ndirected, and weighted graphs, and then leverages graph neural networks to\nperform graph-level anomaly detection. Specifically, we introduce One-Class\nDigraph Inception Convolutional Networks, abbreviated as OCDiGCN, a novel graph\nneural network model for detecting graph-level anomalies in a collection of\nattributed, directed, and weighted graphs. By coupling the graph representation\nand anomaly detection steps, OCDiGCN can learn a representation that is\nespecially suited for anomaly detection, resulting in a high detection\naccuracy. Importantly, for each identified anomaly, we additionally provide a\nsmall subset of nodes that play a crucial role in OCDiGCN's prediction as\nexplanations, which can offer valuable cues for subsequent root cause\ndiagnosis. Experiments on five benchmark datasets show that Logs2Graphs\nperforms at least on par with state-of-the-art log anomaly detection methods on\nsimple datasets while largely outperforming state-of-the-art log anomaly\ndetection methods on complicated datasets.\n","authors":["Zhong Li","Jiayang Shi","Matthijs van Leeuwen"],"pdf_url":"https://arxiv.org/pdf/2307.00527v2.pdf","comment":"Preprint submitted to Engineering Applications of Artificial\n Intelligence"},{"id":"http://arxiv.org/abs/2310.04993v2","updated":"2023-10-13T08:37:29Z","published":"2023-10-08T03:41:16Z","title":"Prompt-augmented Temporal Point Process for Streaming Event Sequence","summary":" Neural Temporal Point Processes (TPPs) are the prevalent paradigm for\nmodeling continuous-time event sequences, such as user activities on the web\nand financial transactions. In real-world applications, event data is typically\nreceived in a \\emph{streaming} manner, where the distribution of patterns may\nshift over time. Additionally, \\emph{privacy and memory constraints} are\ncommonly observed in practical scenarios, further compounding the challenges.\nTherefore, the continuous monitoring of a TPP to learn the streaming event\nsequence is an important yet under-explored problem. 
Our work addresses\nthis challenge by adopting Continual Learning (CL), which makes the model\ncapable of continuously learning a sequence of tasks without catastrophic\nforgetting under realistic constraints. Correspondingly, we propose a simple\nyet effective framework, PromptTPP\\footnote{Our code is available at {\\small\n\\url{ https://github.com/yanyanSann/PromptTPP}}}, by integrating the base TPP\nwith a continuous-time retrieval prompt pool. The prompts, small learnable\nparameters, are stored in a memory space and jointly optimized with the base\nTPP, ensuring that the model learns event streams sequentially without\nbuffering past examples or task-specific attributes. We present a novel and\nrealistic experimental setup for modeling event streams, where PromptTPP\nconsistently achieves state-of-the-art performance across three real user\nbehavior datasets.\n","authors":["Siqiao Xue","Yan Wang","Zhixuan Chu","Xiaoming Shi","Caigao Jiang","Hongyan Hao","Gangwei Jiang","Xiaoyun Feng","James Y. Zhang","Jun Zhou"],"pdf_url":"https://arxiv.org/pdf/2310.04993v2.pdf","comment":"NeurIPS 2023 camera ready version"},{"id":"http://arxiv.org/abs/2310.08944v1","updated":"2023-10-13T08:19:31Z","published":"2023-10-13T08:19:31Z","title":"CAMELL: Confidence-based Acquisition Model for Efficient Self-supervised\n Active Learning with Label Validation","summary":" Supervised neural approaches are hindered by their dependence on large,\nmeticulously annotated datasets, a requirement that is particularly cumbersome\nfor sequential tasks. The quality of annotations tends to deteriorate with the\ntransition from expert-based to crowd-sourced labelling. To address these\nchallenges, we present \\textbf{CAMELL} (Confidence-based Acquisition Model for\nEfficient self-supervised active Learning with Label validation), a pool-based\nactive learning framework tailored for sequential multi-output problems. CAMELL\npossesses three core features: (1) it requires expert annotators to label only\na fraction of a chosen sequence, (2) it facilitates self-supervision for the\nremainder of the sequence, and (3) it employs a label validation mechanism to\nprevent erroneous labels from contaminating the dataset and harming model\nperformance. We evaluate CAMELL on sequential tasks, with a special emphasis on\ndialogue belief tracking, a task plagued by the constraints of limited and\nnoisy datasets. Our experiments demonstrate that CAMELL outperforms the\nbaselines in terms of efficiency. Furthermore, the data corrections suggested\nby our method contribute to an overall improvement in the quality of the\nresulting datasets.\n","authors":["Carel van Niekerk","Christian Geishauser","Michael Heck","Shutong Feng","Hsien-chin Lin","Nurul Lubis","Benjamin Ruppik","Renato Vukovic","Milica Gašić"],"pdf_url":"https://arxiv.org/pdf/2310.08944v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.05880v4","updated":"2023-10-13T08:00:04Z","published":"2023-06-09T13:20:04Z","title":"Time Series Continuous Modeling for Imputation and Forecasting with\n Implicit Neural Representations","summary":" We introduce a novel modeling approach for time series imputation and\nforecasting, tailored to address the challenges often encountered in real-world\ndata, such as irregular samples, missing data, or unaligned measurements from\nmultiple sensors. Our method relies on a continuous-time-dependent model of the\nseries' evolution dynamics. It leverages adaptations of conditional, implicit\nneural representations for sequential data. 
A modulation mechanism, driven by a\nmeta-learning algorithm, allows adaptation to unseen samples and extrapolation\nbeyond observed time-windows for long-term predictions. The model provides a\nhighly flexible and unified framework for imputation and forecasting tasks\nacross a wide range of challenging scenarios. It achieves state-of-the-art\nperformance on classical benchmarks and outperforms alternative time-continuous\nmodels.\n","authors":["Etienne Le Naour","Louis Serrano","Léon Migus","Yuan Yin","Ghislain Agoua","Nicolas Baskiotis","Patrick Gallinari","Vincent Guigue"],"pdf_url":"https://arxiv.org/pdf/2306.05880v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.14642v2","updated":"2023-10-13T07:55:20Z","published":"2023-05-24T02:23:00Z","title":"Newton-Cotes Graph Neural Networks: On the Time Evolution of Dynamic\n Systems","summary":" Reasoning system dynamics is one of the most important analytical approaches\nfor many scientific studies. With the initial state of a system as input, the\nrecent graph neural networks (GNNs)-based methods are capable of predicting the\nfuture state distant in time with high accuracy. Although these methods have\ndiverse designs in modeling the coordinates and interacting forces of the\nsystem, we show that they actually share a common paradigm that learns the\nintegration of the velocity over the interval between the initial and terminal\ncoordinates. However, their integrand is constant w.r.t. time. Inspired by this\nobservation, we propose a new approach to predict the integration based on\nseveral velocity estimations with Newton-Cotes formulas and prove its\neffectiveness theoretically. Extensive experiments on several benchmarks\nempirically demonstrate consistent and significant improvement compared with\nthe state-of-the-art methods.\n","authors":["Lingbing Guo","Weiqing Wang","Zhuo Chen","Ningyu Zhang","Zequn Sun","Yixuan Lai","Qiang Zhang","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2305.14642v2.pdf","comment":"NeurIPS 2023 (spotlight)"},{"id":"http://arxiv.org/abs/2310.08922v1","updated":"2023-10-13T07:47:44Z","published":"2023-10-13T07:47:44Z","title":"LLaMA Rider: Spurring Large Language Models to Explore the Open World","summary":" Recently, various studies have leveraged Large Language Models (LLMs) to help\ndecision-making and planning in environments, and try to align the LLMs'\nknowledge with the world conditions. Nonetheless, the capacity of LLMs to\ncontinuously acquire environmental knowledge and adapt in an open world remains\nuncertain. In this paper, we propose an approach to spur LLMs to explore the\nopen world, gather experiences, and learn to improve their task-solving\ncapabilities. In this approach, a multi-round feedback-revision mechanism is\nutilized to encourage LLMs to actively select appropriate revision actions\nguided by feedback information from the environment. This facilitates\nexploration and enhances the model's performance. Besides, we integrate\nsub-task relabeling to assist LLMs in maintaining consistency in sub-task\nplanning and help the model learn the combinatorial nature between tasks,\nenabling it to complete a wider range of tasks through training based on the\nacquired exploration experiences. 
By evaluation in Minecraft, an open-ended\nsandbox world, we demonstrate that our approach LLaMA-Rider enhances the\nefficiency of the LLM in exploring the environment, and effectively improves\nthe LLM's ability to accomplish more tasks through fine-tuning with merely 1.3k\ninstances of collected data, showing minimal training costs compared to the\nbaseline using reinforcement learning.\n","authors":["Yicheng Feng","Yuxuan Wang","Jiazheng Liu","Sipeng Zheng","Zongqing Lu"],"pdf_url":"https://arxiv.org/pdf/2310.08922v1.pdf","comment":"18 pages"},{"id":"http://arxiv.org/abs/2308.02416v2","updated":"2023-10-13T07:44:39Z","published":"2023-08-03T02:07:32Z","title":"Local-Global Temporal Fusion Network with an Attention Mechanism for\n Multiple and Multiclass Arrhythmia Classification","summary":" Clinical decision support systems (CDSSs) have been widely utilized to\nsupport the decisions made by cardiologists when detecting and classifying\narrhythmia from electrocardiograms (ECGs). However, forming a CDSS for the\narrhythmia classification task is challenging due to the varying lengths of\narrhythmias. Although the onset time of arrhythmia varies, previously developed\nmethods have not considered such conditions. Thus, we propose a framework that\nconsists of (i) local temporal information extraction, (ii) global pattern\nextraction, and (iii) local-global information fusion with attention to perform\narrhythmia detection and classification with a constrained input length. The\n10-class and 4-class performances of our approach were assessed by detecting\nthe onset and offset of arrhythmia as an episode and the duration of arrhythmia\nbased on the MIT-BIH arrhythmia database (MITDB) and MIT-BIH atrial\nfibrillation database (AFDB), respectively. The results were statistically\nsuperior to those achieved by the comparison models. To check the\ngeneralization ability of the proposed method, an AFDB-trained model was tested\non the MITDB, and superior performance was attained compared with that of a\nstate-of-the-art model. The proposed method can capture local-global\ninformation and dynamics without incurring information losses. Therefore,\narrhythmias can be recognized more accurately, and their occurrence times can\nbe calculated; thus, the clinical field can create more accurate treatment\nplans by using the proposed method.\n","authors":["Yun Kwan Kim","Minji Lee","Kunwook Jo","Hee Seok Song","Seong-Whan Lee"],"pdf_url":"https://arxiv.org/pdf/2308.02416v2.pdf","comment":"14 pages, 6 figures"},{"id":"http://arxiv.org/abs/2310.08920v1","updated":"2023-10-13T07:44:05Z","published":"2023-10-13T07:44:05Z","title":"Embarrassingly Simple Text Watermarks","summary":" We propose Easymark, a family of embarrassingly simple yet effective\nwatermarks. Text watermarking is becoming increasingly important with the\nadvent of Large Language Models (LLM). LLMs can generate texts that cannot be\ndistinguished from human-written texts. This is a serious problem for the\ncredibility of the text. Easymark is a simple yet effective solution to this\nproblem. Easymark can inject a watermark without changing the meaning of the\ntext at all while a validator can detect if a text was generated from a system\nthat adopted Easymark or not with high credibility. Easymark is extremely easy\nto implement so that it only requires a few lines of code. Easymark does not\nrequire access to LLMs, so it can be implemented on the user-side when the LLM\nproviders do not offer watermarked LLMs. 
In spite of its simplicity, it\nachieves higher detection accuracy and BLEU scores than the state-of-the-art\ntext watermarking methods. We also prove the impossibility theorem of perfect\nwatermarking, which is valuable in its own right. This theorem shows that no\nmatter how sophisticated a watermark is, a malicious user could remove it from\nthe text, which motivates us to use a simple watermark such as Easymark. We\ncarry out experiments with LLM-generated texts and confirm that Easymark can be\ndetected reliably without any degradation of BLEU and perplexity, and\noutperforms state-of-the-art watermarks in terms of both quality and\nreliability.\n","authors":["Ryoma Sato","Yuki Takezawa","Han Bao","Kenta Niwa","Makoto Yamada"],"pdf_url":"https://arxiv.org/pdf/2310.08920v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08917v1","updated":"2023-10-13T07:40:12Z","published":"2023-10-13T07:40:12Z","title":"Relation-aware Ensemble Learning for Knowledge Graph Embedding","summary":" Knowledge graph (KG) embedding is a fundamental task in natural language\nprocessing, and various methods have been proposed to explore semantic patterns\nin distinctive ways. In this paper, we propose to learn an ensemble by\nleveraging existing methods in a relation-aware manner. However, exploring\nthese semantics using relation-aware ensemble leads to a much larger search\nspace than general ensemble methods. To address this issue, we propose a\ndivide-search-combine algorithm RelEns-DSC that searches the relation-wise\nensemble weights independently. This algorithm has the same computation cost as\ngeneral ensemble methods but with much better performance. Experimental results\non benchmark datasets demonstrate the effectiveness of the proposed method in\nefficiently searching relation-aware ensemble weights and achieving\nstate-of-the-art embedding performance. The code is public at\nhttps://github.com/LARS-research/RelEns.\n","authors":["Ling Yue","Yongqi Zhang","Quanming Yao","Yong Li","Xian Wu","Ziheng Zhang","Zhenxi Lin","Yefeng Zheng"],"pdf_url":"https://arxiv.org/pdf/2310.08917v1.pdf","comment":"This short paper has been accepted by EMNLP 2023"},{"id":"http://arxiv.org/abs/2310.08910v1","updated":"2023-10-13T07:31:04Z","published":"2023-10-13T07:31:04Z","title":"Scalarization for Multi-Task and Multi-Domain Learning at Scale","summary":" Training a single model on multiple input domains and/or output tasks allows\nfor compressing information from multiple sources into a unified backbone hence\nimproves model efficiency. It also enables potential positive knowledge\ntransfer across tasks/domains, leading to improved accuracy and data-efficient\ntraining. However, optimizing such networks is a challenge, in particular due\nto discrepancies between the different tasks or domains: Despite several\nhypotheses and solutions proposed over the years, recent work has shown that\nuniform scalarization training, i.e., simply minimizing the average of the task\nlosses, yields on-par performance with more costly SotA optimization methods.\nThis raises the issue of how well we understand the training dynamics of\nmulti-task and multi-domain networks. In this work, we first devise a\nlarge-scale unified analysis of multi-domain and multi-task learning to better\nunderstand the dynamics of scalarization across varied task/domain combinations\nand model sizes. 
Following these insights, we then propose to leverage\npopulation-based training to efficiently search for the optimal scalarization\nweights when dealing with a large number of tasks or domains.\n","authors":["Amelie Royer","Tijmen Blankevoort","Babak Ehteshami Bejnordi"],"pdf_url":"https://arxiv.org/pdf/2310.08910v1.pdf","comment":"NeurIPS 2023; https://openreview.net/forum?id=TSuq3debnD"},{"id":"http://arxiv.org/abs/2310.08909v1","updated":"2023-10-13T07:30:50Z","published":"2023-10-13T07:30:50Z","title":"Community Membership Hiding as Counterfactual Graph Search via Deep\n Reinforcement Learning","summary":" Community detection techniques are useful tools for social media platforms to\ndiscover tightly connected groups of users who share common interests. However,\nthis functionality often comes at the expense of potentially exposing\nindividuals to privacy breaches by inadvertently revealing their tastes or\npreferences. Therefore, some users may wish to safeguard their anonymity and\nopt out of community detection for various reasons, such as affiliation with\npolitical or religious organizations.\n In this study, we address the challenge of community membership hiding, which\ninvolves strategically altering the structural properties of a network graph to\nprevent one or more nodes from being identified by a given community detection\nalgorithm. We tackle this problem by formulating it as a constrained\ncounterfactual graph objective, and we solve it via deep reinforcement\nlearning. We validate the effectiveness of our method through two distinct\ntasks: node and community deception. Extensive experiments show that our\napproach overall outperforms existing baselines in both tasks.\n","authors":["Andrea Bernini","Fabrizio Silvestri","Gabriele Tolomei"],"pdf_url":"https://arxiv.org/pdf/2310.08909v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2202.03574v4","updated":"2023-10-13T07:13:51Z","published":"2022-02-04T12:30:49Z","title":"Structured Prediction Problem Archive","summary":" Structured prediction problems are one of the fundamental tools in machine\nlearning. In order to facilitate algorithm development for their numerical\nsolution, we collect in one place a large number of datasets in easy to read\nformats for a diverse set of problem classes. We provide archival links to\ndatasets, description of the considered problems and problem formats, and a\nshort summary of problem characteristics including size, number of instances\netc. For reference we also give a non-exhaustive selection of algorithms\nproposed in the literature for their solution. We hope that this central\nrepository will make benchmarking and comparison to established works easier.\nWe welcome submission of interesting new datasets and algorithms for inclusion\nin our archive.\n","authors":["Paul Swoboda","Ahmed Abbas","Florian Bernard","Andrea Hornakova","Paul Roetzer","Bogdan Savchynskyy"],"pdf_url":"https://arxiv.org/pdf/2202.03574v4.pdf","comment":"Added new shape matching instances based of learned descriptors"},{"id":"http://arxiv.org/abs/2310.08897v1","updated":"2023-10-13T06:58:52Z","published":"2023-10-13T06:58:52Z","title":"Self supervised convolutional kernel based handcrafted feature\n harmonization: Enhanced left ventricle hypertension disease phenotyping on\n echocardiography","summary":" Radiomics, a medical imaging technique, extracts quantitative handcrafted\nfeatures from images to predict diseases. 
Harmonization in those features\nensures consistent feature extraction across various imaging devices and\nprotocols. Methods for harmonization include standardized imaging protocols,\nstatistical adjustments, and evaluating feature robustness. Myocardial diseases\nsuch as Left Ventricular Hypertrophy (LVH) and Hypertensive Heart Disease (HHD)\nare diagnosed via echocardiography, but variable imaging settings pose\nchallenges. Harmonization techniques are crucial for applying handcrafted\nfeatures in disease diagnosis in such scenarios. Self-supervised learning (SSL)\nenhances data understanding within limited datasets and adapts to diverse data\nsettings. ConvNeXt-V2 integrates convolutional layers into SSL, displaying\nsuperior performance in various tasks. This study focuses on convolutional\nfilters within SSL, using them as preprocessing to convert images into feature\nmaps for handcrafted feature harmonization. Our proposed method excelled in\nharmonization evaluation and exhibited superior LVH classification performance\ncompared to existing methods.\n","authors":["Jina Lee","Youngtaek Hong","Dawun Jeong","Yeonggul Jang","Sihyeon Jeong","Taekgeun Jung","Yeonyee E. Yoon","Inki Moon","Seung-Ah Lee","Hyuk-Jae Chang"],"pdf_url":"https://arxiv.org/pdf/2310.08897v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08891v1","updated":"2023-10-13T06:53:02Z","published":"2023-10-13T06:53:02Z","title":"EHI: End-to-end Learning of Hierarchical Index for Efficient Dense\n Retrieval","summary":" Dense embedding-based retrieval is now the industry standard for semantic\nsearch and ranking problems, like obtaining relevant web documents for a given\nquery. Such techniques use a two-stage process: (a) contrastive learning to\ntrain a dual encoder to embed both the query and documents and (b) approximate\nnearest neighbor search (ANNS) for finding similar documents for a given query.\nThese two stages are disjoint; the learned embeddings might be ill-suited for\nthe ANNS method and vice-versa, leading to suboptimal performance. In this\nwork, we propose End-to-end Hierarchical Indexing -- EHI -- that jointly learns\nboth the embeddings and the ANNS structure to optimize retrieval performance.\nEHI uses a standard dual encoder model for embedding queries and documents\nwhile learning an inverted file index (IVF) style tree structure for efficient\nANNS. To ensure stable and efficient learning of discrete tree-based ANNS\nstructure, EHI introduces the notion of dense path embedding that captures the\nposition of a query/document in the tree. We demonstrate the effectiveness of\nEHI on several benchmarks, including de-facto industry standard MS MARCO (Dev\nset and TREC DL19) datasets. For example, with the same compute budget, EHI\noutperforms state-of-the-art (SOTA) by 0.6% (MRR@10) on MS MARCO dev set and\nby 4.2% (nDCG@10) on TREC DL19 benchmarks.\n","authors":["Ramnath Kumar","Anshul Mittal","Nilesh Gupta","Aditya Kusupati","Inderjit Dhillon","Prateek Jain"],"pdf_url":"https://arxiv.org/pdf/2310.08891v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02182v3","updated":"2023-10-13T06:51:27Z","published":"2023-08-04T07:54:45Z","title":"AutoML4ETC: Automated Neural Architecture Search for Real-World\n Encrypted Traffic Classification","summary":" Deep learning (DL) has been successfully applied to encrypted network traffic\nclassification in experimental settings. 
However, in production use, it has\nbeen shown that a DL classifier's performance inevitably decays over time.\nRe-training the model on newer datasets has been shown to only partially\nimprove its performance. Manually re-tuning the model architecture to meet the\nperformance expectations on newer datasets is time-consuming and requires\ndomain expertise. We propose AutoML4ETC, a novel tool to automatically design\nefficient and high-performing neural architectures for encrypted traffic\nclassification. We define a novel, powerful search space tailored specifically\nfor the early classification of encrypted traffic using packet header bytes. We\nshow that with different search strategies over our search space, AutoML4ETC\ngenerates neural architectures that outperform the state-of-the-art encrypted\ntraffic classifiers on several datasets, including public benchmark datasets\nand real-world TLS and QUIC traffic collected from the Orange mobile network.\nIn addition to being more accurate, AutoML4ETC's architectures are\nsignificantly more efficient and lighter in terms of the number of parameters.\nFinally, we make AutoML4ETC publicly available for future research.\n","authors":["Navid Malekghaini","Elham Akbari","Mohammad A. Salahuddin","Noura Limam","Raouf Boutaba","Bertrand Mathieu","Stephanie Moteau","Stephane Tuffin"],"pdf_url":"https://arxiv.org/pdf/2308.02182v3.pdf","comment":"Paper accepted for publication in IEEE TNSM journal. Please cite that\n version"},{"id":"http://arxiv.org/abs/2303.13937v5","updated":"2023-10-13T06:43:44Z","published":"2023-03-24T11:50:08Z","title":"Topological Reconstruction of Particle Physics Processes using Graph\n Neural Networks","summary":" We present a new approach, the Topograph, which reconstructs underlying\nphysics processes, including the intermediary particles, by leveraging\nunderlying priors from the nature of particle physics decays and the\nflexibility of message passing graph neural networks. The Topograph not only\nsolves the combinatoric assignment of observed final state objects, associating\nthem to their original mother particles, but directly predicts the properties\nof intermediate particles in hard scatter processes and their subsequent\ndecays. In comparison to standard combinatoric approaches or modern approaches\nusing graph neural networks, which scale exponentially or quadratically, the\ncomplexity of Topographs scales linearly with the number of reconstructed\nobjects.\n We apply Topographs to top quark pair production in the all hadronic decay\nchannel, where we outperform the standard approach and match the performance of\nthe state-of-the-art machine learning technique.\n","authors":["Lukas Ehrke","John Andrew Raine","Knut Zoch","Manuel Guth","Tobias Golling"],"pdf_url":"https://arxiv.org/pdf/2303.13937v5.pdf","comment":"25 pages, 24 figures, 8 tables"},{"id":"http://arxiv.org/abs/2310.08887v1","updated":"2023-10-13T06:43:11Z","published":"2023-10-13T06:43:11Z","title":"METRA: Scalable Unsupervised RL with Metric-Aware Abstraction","summary":" Unsupervised pre-training strategies have proven to be highly effective in\nnatural language processing and computer vision. Likewise, unsupervised\nreinforcement learning (RL) holds the promise of discovering a variety of\npotentially useful behaviors that can accelerate the learning of a wide array\nof downstream tasks. Previous unsupervised RL approaches have mainly focused on\npure exploration and mutual information skill learning. 
However, despite the\nprevious attempts, making unsupervised RL truly scalable still remains a major\nopen challenge: pure exploration approaches might struggle in complex\nenvironments with large state spaces, where covering every possible transition\nis infeasible, and mutual information skill learning approaches might\ncompletely fail to explore the environment due to the lack of incentives. To\nmake unsupervised RL scalable to complex, high-dimensional environments, we\npropose a novel unsupervised RL objective, which we call Metric-Aware\nAbstraction (METRA). Our main idea is, instead of directly covering the entire\nstate space, to only cover a compact latent space $Z$ that is metrically\nconnected to the state space $S$ by temporal distances. By learning to move in\nevery direction in the latent space, METRA obtains a tractable set of diverse\nbehaviors that approximately cover the state space, being scalable to\nhigh-dimensional environments. Through our experiments in five locomotion and\nmanipulation environments, we demonstrate that METRA can discover a variety of\nuseful behaviors even in complex, pixel-based environments, being the first\nunsupervised RL method that discovers diverse locomotion behaviors in\npixel-based Quadruped and Humanoid. Our code and videos are available at\nhttps://seohong.me/projects/metra/\n","authors":["Seohong Park","Oleh Rybkin","Sergey Levine"],"pdf_url":"https://arxiv.org/pdf/2310.08887v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.17625v2","updated":"2023-10-13T06:36:06Z","published":"2023-05-28T04:08:40Z","title":"Cross-Domain Policy Adaptation via Value-Guided Data Filtering","summary":" Generalizing policies across different domains with dynamics mismatch poses a\nsignificant challenge in reinforcement learning. For example, a robot learns\nthe policy in a simulator, but when it is deployed in the real world, the\ndynamics of the environment may be different. Given the source and target\ndomain with dynamics mismatch, we consider the online dynamics adaptation\nproblem, in which case the agent can access sufficient source domain data while\nonline interactions with the target domain are limited. Existing research has\nattempted to solve the problem from the dynamics discrepancy perspective. In\nthis work, we reveal the limitations of these methods and explore the problem\nfrom the value difference perspective via a novel insight on the value\nconsistency across domains. Specifically, we present the Value-Guided Data\nFiltering (VGDF) algorithm, which selectively shares transitions from the\nsource domain based on the proximity of paired value targets across the two\ndomains. Empirical results on various environments with kinematic and\nmorphology shifts demonstrate that our method achieves superior performance\ncompared to prior approaches.\n","authors":["Kang Xu","Chenjia Bai","Xiaoteng Ma","Dong Wang","Bin Zhao","Zhen Wang","Xuelong Li","Wei Li"],"pdf_url":"https://arxiv.org/pdf/2305.17625v2.pdf","comment":"27 pages, 15 figures"},{"id":"http://arxiv.org/abs/2310.08876v1","updated":"2023-10-13T06:03:07Z","published":"2023-10-13T06:03:07Z","title":"Gesture Recognition for FMCW Radar on the Edge","summary":" This paper introduces a lightweight gesture recognition system based on 60\nGHz frequency modulated continuous wave (FMCW) radar. We show that gestures can\nbe characterized efficiently by a set of five features, and propose a slim\nradar processing algorithm to extract these features. 
In contrast to previous\napproaches, we avoid heavy 2D processing, i.e. range-Doppler imaging, and\nperform instead an early target detection - this allows us to port the system\nto fully embedded platforms with tight constraints on memory, compute and power\nconsumption. A recurrent neural network (RNN) based architecture exploits these\nfeatures to jointly detect and classify five different gestures. The proposed\nsystem recognizes gestures with an F1 score of 98.4% on our hold-out test\ndataset, it runs on an Arm Cortex-M4 microcontroller requiring less than 280 kB\nof flash memory, 120 kB of RAM, and consuming 75 mW of power.\n","authors":["Maximilian Strobel","Stephan Schoenfeldt","Jonas Daugalas"],"pdf_url":"https://arxiv.org/pdf/2310.08876v1.pdf","comment":"4 pages, 5 figures, submitted to 2024 IEEE Topical Conference on\n Wireless Sensors and Sensor Networks (WiSNeT)"},{"id":"http://arxiv.org/abs/2309.03084v3","updated":"2023-10-13T06:01:17Z","published":"2023-09-04T09:16:49Z","title":"Pure Monte Carlo Counterfactual Regret Minimization","summary":" Counterfactual Regret Minimization (CFR) and its variants are the best\nalgorithms so far for solving large-scale incomplete information games.\nHowever, we believe that there are two problems with CFR: First, matrix\nmultiplication is required in CFR iteration, and the time complexity of one\niteration is too high; Secondly, the game characteristics in the real world are\ndifferent. Just using one CFR algorithm will not be perfectly suitable for all\ngame problems.\n For these two problems, this paper proposes a new algorithm called Pure CFR\n(PCFR) based on CFR. PCFR can be seen as a combination of CFR and Fictitious\nPlay (FP), inheriting the concept of counterfactual regret (value) from CFR,\nand using the best response strategy instead of the regret matching strategy\nfor the next iteration. This algorithm has three advantages. First, PCFR can be\ncombined with any CFR variant. The resulting Pure MCCFR (PMCCFR) can\nsignificantly reduce the time and space complexity of one iteration. Secondly,\nour experiments show that the convergence speed of the PMCCFR is 2$\\sim$3 times\nthat of the MCCFR. Finally, there is a type of game that is very suitable for\nPCFR. We call this type of game clear-game, which is characterized by a high\nproportion of dominated strategies. Experiments show that in clear-game, the\nconvergence rate of PMCCFR is two orders of magnitude higher than that of\nMCCFR.\n","authors":["Ju Qi","Ting Feng","Falun Hei","Zhemei Fang","Yunfeng Luo"],"pdf_url":"https://arxiv.org/pdf/2309.03084v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.03986v2","updated":"2023-10-13T05:35:40Z","published":"2023-10-06T03:04:21Z","title":"Robust Multimodal Learning with Missing Modalities via\n Parameter-Efficient Adaptation","summary":" Multimodal learning seeks to utilize data from multiple sources to improve\nthe overall performance of downstream tasks. It is desirable for redundancies\nin the data to make multimodal systems robust to missing or corrupted\nobservations in some correlated modalities. However, we observe that the\nperformance of several existing multimodal networks significantly deteriorates\nif one or multiple modalities are absent at test time. To enable robustness to\nmissing modalities, we propose simple and parameter-efficient adaptation\nprocedures for pretrained multimodal networks. 
In particular, we exploit\nlow-rank adaptation and modulation of intermediate features to compensate for\nthe missing modalities. We demonstrate that such adaptation can partially\nbridge the performance drop due to missing modalities and outperform independent,\ndedicated networks trained for the available modality combinations in some\ncases. The proposed adaptation requires an extremely small number of parameters\n(e.g., fewer than 0.7% of the total parameters in most experiments). We conduct\na series of experiments to highlight the robustness of our proposed method\nusing diverse datasets for RGB-thermal and RGB-Depth semantic segmentation,\nmultimodal material segmentation, and multimodal sentiment analysis tasks. Our\nproposed method demonstrates versatility across various tasks and datasets, and\noutperforms existing methods for robust multimodal learning with missing\nmodalities.\n","authors":["Md Kaykobad Reza","Ashley Prater-Bennette","M. Salman Asif"],"pdf_url":"https://arxiv.org/pdf/2310.03986v2.pdf","comment":"18 pages, 3 figures, 11 tables"},{"id":"http://arxiv.org/abs/2310.08867v1","updated":"2023-10-13T05:35:13Z","published":"2023-10-13T05:35:13Z","title":"A Survey of Methods for Handling Disk Data Imbalance","summary":" Class imbalance exists in many classification problems, and since the data is\ndesigned for accuracy, imbalance in data classes can lead to classification\nchallenges with a few classes having higher misclassification costs. The\nBackblaze dataset, a widely used dataset related to hard discs, has a small\namount of failure data and a large amount of health data, which exhibits a\nserious class imbalance. This paper provides a comprehensive overview of\nresearch in the field of imbalanced data classification. The discussion is\norganized into three main aspects: data-level methods, algorithmic-level\nmethods, and hybrid methods. For each type of method, we summarize and analyze\nthe existing problems, algorithmic ideas, strengths, and weaknesses.\nAdditionally, the challenges of unbalanced data classification are discussed,\nalong with strategies to address them. It is convenient for researchers to\nchoose the appropriate method according to their needs.\n","authors":["Shuangshuang Yuan","Peng Wu","Yuehui Chen","Qiang Li"],"pdf_url":"https://arxiv.org/pdf/2310.08867v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08866v1","updated":"2023-10-13T05:29:09Z","published":"2023-10-13T05:29:09Z","title":"Adaptivity and Modularity for Efficient Generalization Over Task\n Complexity","summary":" Can transformers generalize efficiently on problems that require dealing with\nexamples with different levels of difficulty? We introduce a new task tailored\nto assess generalization over different complexities and present results that\nindicate that standard transformers face challenges in solving these tasks.\nThese tasks are variations of pointer value retrieval previously introduced by\nZhang et al. (2021). We investigate how the use of a mechanism for adaptive and\nmodular computation in transformers facilitates the learning of tasks that\ndemand generalization over the number of sequential computation steps (i.e.,\nthe depth of the computation graph). 
Based on our observations, we propose a\ntransformer-based architecture called Hyper-UT, which combines dynamic function\ngeneration from hyper networks with adaptive depth from Universal Transformers.\nThis model demonstrates higher accuracy and a fairer allocation of\ncomputational resources when generalizing to higher numbers of computation\nsteps. We conclude that mechanisms for adaptive depth and modularity complement\neach other in improving efficient generalization concerning example complexity.\nAdditionally, to emphasize the broad applicability of our findings, we\nillustrate that in a standard image recognition task, Hyper- UT's performance\nmatches that of a ViT model but with considerably reduced computational demands\n(achieving over 70\\% average savings by effectively using fewer layers).\n","authors":["Samira Abnar","Omid Saremi","Laurent Dinh","Shantel Wilson","Miguel Angel Bautista","Chen Huang","Vimal Thilak","Etai Littwin","Jiatao Gu","Josh Susskind","Samy Bengio"],"pdf_url":"https://arxiv.org/pdf/2310.08866v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08863v1","updated":"2023-10-13T05:12:48Z","published":"2023-10-13T05:12:48Z","title":"In-Context Learning for Few-Shot Molecular Property Prediction","summary":" In-context learning has become an important approach for few-shot learning in\nLarge Language Models because of its ability to rapidly adapt to new tasks\nwithout fine-tuning model parameters. However, it is restricted to applications\nin natural language and inapplicable to other domains. In this paper, we adapt\nthe concepts underpinning in-context learning to develop a new algorithm for\nfew-shot molecular property prediction. Our approach learns to predict\nmolecular properties from a context of (molecule, property measurement) pairs\nand rapidly adapts to new properties without fine-tuning. On the FS-Mol and\nBACE molecular property prediction benchmarks, we find this method surpasses\nthe performance of recent meta-learning algorithms at small support sizes and\nis competitive with the best methods at large support sizes.\n","authors":["Christopher Fifty","Jure Leskovec","Sebastian Thrun"],"pdf_url":"https://arxiv.org/pdf/2310.08863v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08858v1","updated":"2023-10-13T04:59:44Z","published":"2023-10-13T04:59:44Z","title":"Adam-family Methods with Decoupled Weight Decay in Deep Learning","summary":" In this paper, we investigate the convergence properties of a wide class of\nAdam-family methods for minimizing quadratically regularized nonsmooth\nnonconvex optimization problems, especially in the context of training\nnonsmooth neural networks with weight decay. Motivated by the AdamW method, we\npropose a novel framework for Adam-family methods with decoupled weight decay.\nWithin our framework, the estimators for the first-order and second-order\nmoments of stochastic subgradients are updated independently of the weight\ndecay term. Under mild assumptions and with non-diminishing stepsizes for\nupdating the primary optimization variables, we establish the convergence\nproperties of our proposed framework. In addition, we show that our proposed\nframework encompasses a wide variety of well-known Adam-family methods, hence\noffering convergence guarantees for these methods in the training of nonsmooth\nneural networks. 
More importantly, we show that our proposed framework\nasymptotically approximates the SGD method, thereby providing an explanation\nfor the empirical observation that decoupled weight decay enhances\ngeneralization performance for Adam-family methods. As a practical application\nof our proposed framework, we propose a novel Adam-family method named Adam\nwith Decoupled Weight Decay (AdamD), and establish its convergence properties\nunder mild conditions. Numerical experiments demonstrate that AdamD outperforms\nAdam and is comparable to AdamW, in the aspects of both generalization\nperformance and efficiency.\n","authors":["Kuangyu Ding","Nachuan Xiao","Kim-Chuan Toh"],"pdf_url":"https://arxiv.org/pdf/2310.08858v1.pdf","comment":"26 pages"},{"id":"http://arxiv.org/abs/2310.08855v1","updated":"2023-10-13T04:50:40Z","published":"2023-10-13T04:50:40Z","title":"Overcoming Recency Bias of Normalization Statistics in Continual\n Learning: Balance and Adaptation","summary":" Continual learning entails learning a sequence of tasks and balancing their\nknowledge appropriately. With limited access to old training samples, much of\nthe current work in deep neural networks has focused on overcoming catastrophic\nforgetting of old tasks in gradient-based optimization. However, the\nnormalization layers provide an exception, as they are updated interdependently\nby the gradient and statistics of currently observed training samples, which\nrequire specialized strategies to mitigate recency bias. In this work, we focus\non the most popular Batch Normalization (BN) and provide an in-depth\ntheoretical analysis of its sub-optimality in continual learning. Our analysis\ndemonstrates the dilemma between balance and adaptation of BN statistics for\nincremental tasks, which potentially affects training stability and\ngeneralization. Targeting on these particular challenges, we propose Adaptive\nBalance of BN (AdaB$^2$N), which incorporates appropriately a Bayesian-based\nstrategy to adapt task-wise contributions and a modified momentum to balance BN\nstatistics, corresponding to the training and testing stages. By implementing\nBN in a continual learning fashion, our approach achieves significant\nperformance gains across a wide range of benchmarks, particularly for the\nchallenging yet realistic online scenarios (e.g., up to 7.68%, 6.86% and 4.26%\non Split CIFAR-10, Split CIFAR-100 and Split Mini-ImageNet, respectively). Our\ncode is available at https://github.com/lvyilin/AdaB2N.\n","authors":["Yilin Lyu","Liyuan Wang","Xingxing Zhang","Zicheng Sun","Hang Su","Jun Zhu","Liping Jing"],"pdf_url":"https://arxiv.org/pdf/2310.08855v1.pdf","comment":"Accepted by NeurIPS 2023"},{"id":"http://arxiv.org/abs/2306.17439v2","updated":"2023-10-13T04:50:04Z","published":"2023-06-30T07:24:32Z","title":"Provable Robust Watermarking for AI-Generated Text","summary":" We study the problem of watermarking large language models (LLMs) generated\ntext -- one of the most promising approaches for addressing the safety\nchallenges of LLM usage. In this paper, we propose a rigorous theoretical\nframework to quantify the effectiveness and robustness of LLM watermarks. We\npropose a robust and high-quality watermark method, Unigram-Watermark, by\nextending an existing approach with a simplified fixed grouping strategy. We\nprove that our watermark method enjoys guaranteed generation quality,\ncorrectness in watermark detection, and is robust against text editing and\nparaphrasing. 
Experiments on three varying LLMs and two datasets verify that\nour Unigram-Watermark achieves superior detection accuracy and comparable\ngeneration quality in perplexity, thus promoting the responsible use of LLMs.\nCode is available at https://github.com/XuandongZhao/Unigram-Watermark.\n","authors":["Xuandong Zhao","Prabhanjan Ananth","Lei Li","Yu-Xiang Wang"],"pdf_url":"https://arxiv.org/pdf/2306.17439v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08854v1","updated":"2023-10-13T04:48:32Z","published":"2023-10-13T04:48:32Z","title":"Rank-DETR for High Quality Object Detection","summary":" Modern detection transformers (DETRs) use a set of object queries to predict\na list of bounding boxes, sort them by their classification confidence scores,\nand select the top-ranked predictions as the final detection results for the\ngiven input image. A highly performant object detector requires accurate\nranking for the bounding box predictions. For DETR-based detectors, the\ntop-ranked bounding boxes suffer from less accurate localization quality due to\nthe misalignment between classification scores and localization accuracy, thus\nimpeding the construction of high-quality detectors. In this work, we introduce\na simple and highly performant DETR-based object detector by proposing a series\nof rank-oriented designs, combinedly called Rank-DETR. Our key contributions\ninclude: (i) a rank-oriented architecture design that can prompt positive\npredictions and suppress the negative ones to ensure lower false positive\nrates, as well as (ii) a rank-oriented loss function and matching cost design\nthat prioritizes predictions of more accurate localization accuracy during\nranking to boost the AP under high IoU thresholds. We apply our method to\nimprove the recent SOTA methods (e.g., H-DETR and DINO-DETR) and report strong\nCOCO object detection results when using different backbones such as\nResNet-$50$, Swin-T, and Swin-L, demonstrating the effectiveness of our\napproach. Code is available at \\url{https://github.com/LeapLabTHU/Rank-DETR}.\n","authors":["Yifan Pu","Weicong Liang","Yiduo Hao","Yuhui Yuan","Yukang Yang","Chao Zhang","Han Hu","Gao Huang"],"pdf_url":"https://arxiv.org/pdf/2310.08854v1.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2310.08848v1","updated":"2023-10-13T04:22:21Z","published":"2023-10-13T04:22:21Z","title":"Semi-Supervised End-To-End Contrastive Learning For Time Series\n Classification","summary":" Time series classification is a critical task in various domains, such as\nfinance, healthcare, and sensor data analysis. Unsupervised contrastive\nlearning has garnered significant interest in learning effective\nrepresentations from time series data with limited labels. The prevalent\napproach in existing contrastive learning methods consists of two separate\nstages: pre-training the encoder on unlabeled datasets and fine-tuning the\nwell-trained model on a small-scale labeled dataset. However, such two-stage\napproaches suffer from several shortcomings, such as the inability of\nunsupervised pre-training contrastive loss to directly affect downstream\nfine-tuning classifiers, and the lack of exploiting the classification loss\nwhich is guided by valuable ground truth. In this paper, we propose an\nend-to-end model called SLOTS (Semi-supervised Learning fOr Time\nclasSification). 
SLOTS receives semi-labeled datasets, comprising a large\nnumber of unlabeled samples and a small proportion of labeled samples, and maps\nthem to an embedding space through an encoder. We not only calculate the\nunsupervised contrastive loss but also measure the supervised contrastive loss\non the samples with ground truth. The learned embeddings are fed into a\nclassifier, and the classification loss is calculated using the available true\nlabels. The unsupervised, supervised contrastive losses and classification loss\nare jointly used to optimize the encoder and classifier. We evaluate SLOTS by\ncomparing it with ten state-of-the-art methods across five datasets. The\nresults demonstrate that SLOTS is a simple yet effective framework. When\ncompared to the two-stage framework, our end-to-end SLOTS utilizes the same\ninput data, consumes a similar computational cost, but delivers significantly\nimproved performance. We release code and datasets at\nhttps://anonymous.4open.science/r/SLOTS-242E.\n","authors":["Huili Cai","Xiang Zhang","Xiaofeng Liu"],"pdf_url":"https://arxiv.org/pdf/2310.08848v1.pdf","comment":"Submitted to NeurIPS 2023"},{"id":"http://arxiv.org/abs/2310.08847v1","updated":"2023-10-13T04:14:51Z","published":"2023-10-13T04:14:51Z","title":"On the Over-Memorization During Natural, Robust and Catastrophic\n Overfitting","summary":" Overfitting negatively impacts the generalization ability of deep neural\nnetworks (DNNs) in both natural and adversarial training. Existing methods\nstruggle to consistently address different types of overfitting, typically\ndesigning strategies that focus separately on either natural or adversarial\npatterns. In this work, we adopt a unified perspective by solely focusing on\nnatural patterns to explore different types of overfitting. Specifically, we\nexamine the memorization effect in DNNs and reveal a shared behaviour termed\nover-memorization, which impairs their generalization capacity. This behaviour\nmanifests as DNNs suddenly becoming high-confidence in predicting certain\ntraining patterns and retaining a persistent memory for them. Furthermore, when\nDNNs over-memorize an adversarial pattern, they tend to simultaneously exhibit\nhigh-confidence prediction for the corresponding natural pattern. These\nfindings motivate us to holistically mitigate different types of overfitting by\nhindering the DNNs from over-memorizing natural patterns. To this end, we\npropose a general framework, Distraction Over-Memorization (DOM), which\nexplicitly prevents over-memorization by either removing or augmenting the\nhigh-confidence natural patterns. Extensive experiments demonstrate the\neffectiveness of our proposed method in mitigating overfitting across various\ntraining paradigms.\n","authors":["Runqi Lin","Chaojian Yu","Bo Han","Tongliang Liu"],"pdf_url":"https://arxiv.org/pdf/2310.08847v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.10577v3","updated":"2023-10-13T04:05:07Z","published":"2023-06-18T14:38:29Z","title":"OpenDataVal: a Unified Benchmark for Data Valuation","summary":" Assessing the quality and impact of individual data points is critical for\nimproving model performance and mitigating undesirable biases within the\ntraining dataset. Several data valuation algorithms have been proposed to\nquantify data quality; however, there is no systematic and standardized\nbenchmarking system for data valuation. 
In this paper, we introduce\nOpenDataVal, an easy-to-use and unified benchmark framework that empowers\nresearchers and practitioners to apply and compare various data valuation\nalgorithms. OpenDataVal provides an integrated environment that includes (i) a\ndiverse collection of image, natural language, and tabular datasets, (ii)\nimplementations of eleven different state-of-the-art data valuation algorithms,\nand (iii) a prediction model API that can import any models in scikit-learn.\nFurthermore, we propose four downstream machine learning tasks for evaluating\nthe quality of data values. We perform benchmarking analysis using OpenDataVal,\nquantifying and comparing the efficacy of state-of-the-art data valuation\napproaches. We find that no single algorithm performs uniformly best across all\ntasks, and an appropriate algorithm should be employed for a user's downstream\ntask. OpenDataVal is publicly available at https://opendataval.github.io with\ncomprehensive documentation. Furthermore, we provide a leaderboard where\nresearchers can evaluate the effectiveness of their own data valuation\nalgorithms.\n","authors":["Kevin Fu Jiang","Weixin Liang","James Zou","Yongchan Kwon"],"pdf_url":"https://arxiv.org/pdf/2306.10577v3.pdf","comment":"25 pages, NeurIPS 2023 Track on Datasets and Benchmarks"},{"id":"http://arxiv.org/abs/2206.12403v2","updated":"2023-10-13T03:48:11Z","published":"2022-06-24T17:59:02Z","title":"ZSON: Zero-Shot Object-Goal Navigation using Multimodal Goal Embeddings","summary":" We present a scalable approach for learning open-world object-goal navigation\n(ObjectNav) -- the task of asking a virtual robot (agent) to find any instance\nof an object in an unexplored environment (e.g., \"find a sink\"). Our approach\nis entirely zero-shot -- i.e., it does not require ObjectNav rewards or\ndemonstrations of any kind. Instead, we train on the image-goal navigation\n(ImageNav) task, in which agents find the location where a picture (i.e., goal\nimage) was captured. Specifically, we encode goal images into a multimodal,\nsemantic embedding space to enable training semantic-goal navigation\n(SemanticNav) agents at scale in unannotated 3D environments (e.g., HM3D).\nAfter training, SemanticNav agents can be instructed to find objects described\nin free-form natural language (e.g., \"sink\", \"bathroom sink\", etc.) by\nprojecting language goals into the same multimodal, semantic embedding space.\nAs a result, our approach enables open-world ObjectNav. We extensively evaluate\nour agents on three ObjectNav datasets (Gibson, HM3D, and MP3D) and observe\nabsolute improvements in success of 4.2% - 20.0% over existing zero-shot\nmethods. For reference, these gains are similar or better than the 5%\nimprovement in success between the Habitat 2020 and 2021 ObjectNav challenge\nwinners. 
In an open-world setting, we discover that our agents can generalize\nto compound instructions with a room explicitly mentioned (e.g., \"Find a\nkitchen sink\") and when the target room can be inferred (e.g., \"Find a sink and\na stove\").\n","authors":["Arjun Majumdar","Gunjan Aggarwal","Bhavika Devnani","Judy Hoffman","Dhruv Batra"],"pdf_url":"https://arxiv.org/pdf/2206.12403v2.pdf","comment":"code: https://github.com/gunagg/zson"},{"id":"http://arxiv.org/abs/2310.04457v2","updated":"2023-10-13T03:42:43Z","published":"2023-10-04T22:23:40Z","title":"ProGO: Probabilistic Global Optimizer","summary":" In the field of global optimization, many existing algorithms face challenges\nposed by non-convex target functions and high computational complexity or\nunavailability of gradient information. These limitations, exacerbated by\nsensitivity to initial conditions, often lead to suboptimal solutions or failed\nconvergence. This is true even for Metaheuristic algorithms designed to\namalgamate different optimization techniques to improve their efficiency and\nrobustness. To address these challenges, we develop a sequence of\nmultidimensional integration-based methods that we show to converge to the\nglobal optima under some mild regularity conditions. Our probabilistic approach\ndoes not require the use of gradients and is underpinned by a mathematically\nrigorous convergence framework anchored in the nuanced properties of the nascent\noptima distribution. In order to alleviate the problem of multidimensional\nintegration, we develop a latent slice sampler that enjoys a geometric rate of\nconvergence in generating samples from the nascent optima distribution, which\nis used to approximate the global optima. The proposed Probabilistic Global\nOptimizer (ProGO) provides a scalable unified framework to approximate the\nglobal optima of any continuous function defined on a domain of arbitrary\ndimension. Empirical illustrations of ProGO across a variety of popular\nnon-convex test functions (having finite global optima) reveal that the\nproposed algorithm outperforms, by orders of magnitude, many existing\nstate-of-the-art methods, including gradient-based, zeroth-order gradient-free,\nand some Bayesian Optimization methods, in terms of regret value and speed of\nconvergence. It is, however, to be noted that our approach may not be suitable\nfor functions that are expensive to compute.\n","authors":["Xinyu Zhang","Sujit Ghosh"],"pdf_url":"https://arxiv.org/pdf/2310.04457v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.15395v3","updated":"2023-10-13T03:20:33Z","published":"2023-09-27T04:33:09Z","title":"Model-Free, Regret-Optimal Best Policy Identification in Online CMDPs","summary":" This paper considers the best policy identification (BPI) problem in online\nConstrained Markov Decision Processes (CMDPs). We are interested in algorithms\nthat are model-free, have low regret, and identify an optimal policy with a\nhigh probability. Existing model-free algorithms for online CMDPs with\nsublinear regret and constraint violation do not provide any convergence\nguarantee to an optimal policy and provide only average performance guarantees\nwhen a policy is uniformly sampled at random from all previously used policies.\nIn this paper, we develop a new algorithm, named\nPruning-Refinement-Identification (PRI), based on a fundamental structural\nproperty of CMDPs proved in Koole(1988); Ross(1989), which we call limited\nstochasticity. 
The property says for a CMDP with $N$ constraints, there exists\nan optimal policy with at most $N$ stochastic decisions.\n The proposed algorithm first identifies at which step and in which state a\nstochastic decision has to be taken and then fine-tunes the distributions of\nthese stochastic decisions. PRI achieves trio objectives: (i) PRI is a\nmodel-free algorithm; and (ii) it outputs a near-optimal policy with a high\nprobability at the end of learning; and (iii) in the tabular setting, PRI\nguarantees $\\tilde{\\mathcal{O}}(\\sqrt{K})$ regret and constraint violation,\nwhich significantly improves the best existing regret bound\n$\\tilde{\\mathcal{O}}(K^{\\frac{4}{5}})$ under a model-free algorithm, where $K$\nis the total number of episodes.\n","authors":["Zihan Zhou","Honghao Wei","Lei Ying"],"pdf_url":"https://arxiv.org/pdf/2309.15395v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08836v1","updated":"2023-10-13T03:15:42Z","published":"2023-10-13T03:15:42Z","title":"A Framework for Few-Shot Policy Transfer through Observation Mapping and\n Behavior Cloning","summary":" Despite recent progress in Reinforcement Learning for robotics applications,\nmany tasks remain prohibitively difficult to solve because of the expensive\ninteraction cost. Transfer learning helps reduce the training time in the\ntarget domain by transferring knowledge learned in a source domain. Sim2Real\ntransfer helps transfer knowledge from a simulated robotic domain to a physical\ntarget domain. Knowledge transfer reduces the time required to train a task in\nthe physical world, where the cost of interactions is high. However, most\nexisting approaches assume exact correspondence in the task structure and the\nphysical properties of the two domains. This work proposes a framework for\nFew-Shot Policy Transfer between two domains through Observation Mapping and\nBehavior Cloning. We use Generative Adversarial Networks (GANs) along with a\ncycle-consistency loss to map the observations between the source and target\ndomains and later use this learned mapping to clone the successful source task\nbehavior policy to the target domain. We observe successful behavior policy\ntransfer with limited target task interactions and in cases where the source\nand target task are semantically dissimilar.\n","authors":["Yash Shukla","Bharat Kesari","Shivam Goel","Robert Wright","Jivko Sinapov"],"pdf_url":"https://arxiv.org/pdf/2310.08836v1.pdf","comment":"Paper accepted to the IROS 2023 Conference"},{"id":"http://arxiv.org/abs/2310.01404v2","updated":"2023-10-13T03:14:16Z","published":"2023-10-02T17:59:03Z","title":"H-InDex: Visual Reinforcement Learning with Hand-Informed\n Representations for Dexterous Manipulation","summary":" Human hands possess remarkable dexterity and have long served as a source of\ninspiration for robotic manipulation. In this work, we propose a human\n$\\textbf{H}$and$\\textbf{-In}$formed visual representation learning framework to\nsolve difficult $\\textbf{Dex}$terous manipulation tasks ($\\textbf{H-InDex}$)\nwith reinforcement learning. Our framework consists of three stages: (i)\npre-training representations with 3D human hand pose estimation, (ii) offline\nadapting representations with self-supervised keypoint detection, and (iii)\nreinforcement learning with exponential moving average BatchNorm. The last two\nstages only modify $0.36\\%$ parameters of the pre-trained representation in\ntotal, ensuring the knowledge from pre-training is maintained to the full\nextent. 
We empirically study 12 challenging dexterous manipulation tasks and\nfind that H-InDex largely surpasses strong baseline methods and the recent\nvisual foundation models for motor control. Code is available at\nhttps://yanjieze.com/H-InDex .\n","authors":["Yanjie Ze","Yuyao Liu","Ruizhe Shi","Jiaxin Qin","Zhecheng Yuan","Jiashun Wang","Huazhe Xu"],"pdf_url":"https://arxiv.org/pdf/2310.01404v2.pdf","comment":"NeurIPS 2023. Code and videos: https://yanjieze.com/H-InDex"},{"id":"http://arxiv.org/abs/2310.08833v1","updated":"2023-10-13T03:08:59Z","published":"2023-10-13T03:08:59Z","title":"Optimal Sample Complexity for Average Reward Markov Decision Processes","summary":" We settle the sample complexity of policy learning for the maximization of\nthe long run average reward associated with a uniformly ergodic Markov decision\nprocess (MDP), assuming a generative model. In this context, the existing\nliterature provides a sample complexity upper bound of $\\widetilde\nO(|S||A|t_{\\text{mix}}^2 \\epsilon^{-2})$ and a lower bound of\n$\\Omega(|S||A|t_{\\text{mix}} \\epsilon^{-2})$. In these expressions, $|S|$ and\n$|A|$ denote the cardinalities of the state and action spaces respectively,\n$t_{\\text{mix}}$ serves as a uniform upper limit for the total variation mixing\ntimes, and $\\epsilon$ signifies the error tolerance. Therefore, a notable gap\nof $t_{\\text{mix}}$ still remains to be bridged. Our primary contribution is to\nestablish an estimator for the optimal policy of average reward MDPs with a\nsample complexity of $\\widetilde O(|S||A|t_{\\text{mix}}\\epsilon^{-2})$,\neffectively reaching the lower bound in the literature. This is achieved by\ncombining algorithmic ideas in Jin and Sidford (2021) with those of Li et al.\n(2020).\n","authors":["Shengbo Wang","Jose Blanchet","Peter Glynn"],"pdf_url":"https://arxiv.org/pdf/2310.08833v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07805v2","updated":"2023-10-13T03:06:00Z","published":"2023-10-11T18:38:28Z","title":"Generative Modeling with Phase Stochastic Bridges","summary":" Diffusion models (DMs) represent state-of-the-art generative models for\ncontinuous inputs. DMs work by constructing a Stochastic Differential Equation\n(SDE) in the input space (ie, position space), and using a neural network to\nreverse it. In this work, we introduce a novel generative modeling framework\ngrounded in \\textbf{phase space dynamics}, where a phase space is defined as\n{an augmented space encompassing both position and velocity.} Leveraging\ninsights from Stochastic Optimal Control, we construct a path measure in the\nphase space that enables efficient sampling. {In contrast to DMs, our framework\ndemonstrates the capability to generate realistic data points at an early stage\nof dynamics propagation.} This early prediction sets the stage for efficient\ndata generation by leveraging additional velocity information along the\ntrajectory. On standard image generation benchmarks, our model yields favorable\nperformance over baselines in the regime of small Number of Function\nEvaluations (NFEs). Furthermore, our approach rivals the performance of\ndiffusion models equipped with efficient sampling techniques, underscoring its\npotential as a new tool generative modeling.\n","authors":["Tianrong Chen","Jiatao Gu","Laurent Dinh","Evangelos A. 
Theodorou","Josh Susskind","Shuangfei Zhai"],"pdf_url":"https://arxiv.org/pdf/2310.07805v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07958v2","updated":"2023-10-13T02:50:56Z","published":"2023-10-12T00:51:06Z","title":"Towards Causal Deep Learning for Vulnerability Detection","summary":" Deep learning vulnerability detection has shown promising results in recent\nyears. However, an important challenge that still blocks it from being very\nuseful in practice is that the model is not robust under perturbation and it\ncannot generalize well over the out-of-distribution (OOD) data, e.g., applying\na trained model to unseen projects in real world. We hypothesize that this is\nbecause the model learned non-robust features, e.g., variable names, that have\nspurious correlations with labels. When the perturbed and OOD datasets no\nlonger have the same spurious features, the model prediction fails. To address\nthe challenge, in this paper, we introduced causality into deep learning\nvulnerability detection. Our approach CausalVul consists of two phases. First,\nwe designed novel perturbations to discover spurious features that the model\nmay use to make predictions. Second, we applied the causal learning algorithms,\nspecifically, do-calculus, on top of existing deep learning models to\nsystematically remove the use of spurious features and thus promote causal\nbased prediction. Our results show that CausalVul consistently improved the\nmodel accuracy, robustness and OOD performance for all the state-of-the-art\nmodels and datasets we experimented. To the best of our knowledge, this is the\nfirst work that introduces do calculus based causal learning to software\nengineering models and shows it's indeed useful for improving the model\naccuracy, robustness and generalization. Our replication package is located at\nhttps://figshare.com/s/0ffda320dcb96c249ef2.\n","authors":["Md Mahbubur Rahman","Ira Ceka","Chengzhi Mao","Saikat Chakraborty","Baishakhi Ray","Wei Le"],"pdf_url":"https://arxiv.org/pdf/2310.07958v2.pdf","comment":"Accepted at ICSE 2024 (not camera-ready version)"},{"id":"http://arxiv.org/abs/2310.08823v1","updated":"2023-10-13T02:38:35Z","published":"2023-10-13T02:38:35Z","title":"Distance-rank Aware Sequential Reward Learning for Inverse Reinforcement\n Learning with Sub-optimal Demonstrations","summary":" Inverse reinforcement learning (IRL) aims to explicitly infer an underlying\nreward function based on collected expert demonstrations. Considering that\nobtaining expert demonstrations can be costly, the focus of current IRL\ntechniques is on learning a better-than-demonstrator policy using a reward\nfunction derived from sub-optimal demonstrations. However, existing IRL\nalgorithms primarily tackle the challenge of trajectory ranking ambiguity when\nlearning the reward function. They overlook the crucial role of considering the\ndegree of difference between trajectories in terms of their returns, which is\nessential for further removing reward ambiguity. Additionally, it is important\nto note that the reward of a single transition is heavily influenced by the\ncontext information within the trajectory. To address these issues, we\nintroduce the Distance-rank Aware Sequential Reward Learning (DRASRL)\nframework. Unlike existing approaches, DRASRL takes into account both the\nranking of trajectories and the degrees of dissimilarity between them to\ncollaboratively eliminate reward ambiguity when learning a sequence of\ncontextually informed reward signals. 
Specifically, we leverage the distance\nbetween policies, from which the trajectories are generated, as a measure to\nquantify the degree of differences between traces. This distance-aware\ninformation is then used to infer embeddings in the representation space for\nreward learning, employing the contrastive learning technique. Meanwhile, we\nintegrate the pairwise ranking loss function to incorporate ranking information\ninto the latent features. Moreover, we resort to the Transformer architecture\nto capture the contextual dependencies within the trajectories in the latent\nspace, leading to more accurate reward estimation. Through extensive\nexperimentation, our DRASRL framework demonstrates significant performance\nimprovements over previous SOTA methods.\n","authors":["Lu Li","Yuxin Pan","Ruobing Chen","Jie Liu","Zilin Wang","Yu Liu","Zhiheng Li"],"pdf_url":"https://arxiv.org/pdf/2310.08823v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07418v2","updated":"2023-10-13T02:27:42Z","published":"2023-08-14T19:12:40Z","title":"Locally Adaptive and Differentiable Regression","summary":" Over-parameterized models like deep nets and random forests have become very\npopular in machine learning. However, the natural goals of continuity and\ndifferentiability, common in regression models, are now often ignored in modern\noverparametrized, locally-adaptive models. We propose a general framework to\nconstruct a global continuous and differentiable model based on a weighted\naverage of locally learned models in corresponding local regions. This model is\ncompetitive in dealing with data with different densities or scales of function\nvalues in different local regions. We demonstrate that when we mix kernel ridge\nand polynomial regression terms in the local models, and stitch them together\ncontinuously, we achieve faster statistical convergence in theory and improved\nperformance in various practical settings.\n","authors":["Mingxuan Han","Varun Shankar","Jeff M Phillips","Chenglong Ye"],"pdf_url":"https://arxiv.org/pdf/2308.07418v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08817v1","updated":"2023-10-13T02:06:52Z","published":"2023-10-13T02:06:52Z","title":"Exploring the relationship between response time sequence in scale\n answering process and severity of insomnia: a machine learning approach","summary":" Objectives: The study aims to investigate the relationship between insomnia\nand response time. Additionally, it aims to develop a machine learning model to\npredict the presence of insomnia in participants using response time data.\nMethods: A mobile application was designed to administer scale tests and\ncollect response time data from 2729 participants. The relationship between\nsymptom severity and response time was explored, and a machine learning model\nwas developed to predict the presence of insomnia. Results: The result revealed\na statistically significant difference (p<.001) in the total response time\nbetween participants with or without insomnia symptoms. A correlation was\nobserved between the severity of specific insomnia aspects and response times\nat the individual questions level. The machine learning model demonstrated a\nhigh predictive accuracy of 0.743 in predicting insomnia symptoms based on\nresponse time data. 
Conclusions: These findings highlight the potential utility\nof response time data to evaluate cognitive and psychological measures,\ndemonstrating the effectiveness of using response time as a diagnostic tool in\nthe assessment of insomnia.\n","authors":["Zhao Su","Rongxun Liu","Keyin Zhou","Xinru Wei","Ning Wang","Zexin Lin","Yuanchen Xie","Jie Wang","Fei Wang","Shenzhong Zhang","Xizhe Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.08817v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08812v1","updated":"2023-10-13T01:50:43Z","published":"2023-10-13T01:50:43Z","title":"A Nonlinear Method for time series forecasting using VMD-GARCH-LSTM\n model","summary":" Time series forecasting represents a significant and challenging task across\nvarious fields. Recently, methods based on mode decomposition have dominated\nthe forecasting of complex time series because of the advantages of capturing\nlocal characteristics and extracting intrinsic modes from data. Unfortunately,\nmost models fail to capture the implied volatilities that contain significant\ninformation. To enhance the forecasting of current, rapidly evolving, and\nvolatile time series, we propose a novel decomposition-ensemble paradigm, the\nVMD-LSTM-GARCH model. The Variational Mode Decomposition algorithm is employed\nto decompose the time series into K sub-modes. Subsequently, the GARCH model\nextracts the volatility information from these sub-modes, which serve as the\ninput for the LSTM. The numerical and volatility information of each sub-mode\nis utilized to train a Long Short-Term Memory network. This network predicts\nthe sub-mode, and then we aggregate the predictions from all sub-modes to\nproduce the output. By integrating econometric and artificial intelligence\nmethods, and taking into account both the numerical and volatility information\nof the time series, our proposed model demonstrates superior performance in\ntime series forecasting, as evidenced by the significant decrease in MSE, RMSE,\nand MAPE in our comparative experimental results.\n","authors":["Zhengtao Gui","Haoyuan Li","Sijie Xu","Yu Chen"],"pdf_url":"https://arxiv.org/pdf/2310.08812v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.00521v2","updated":"2023-10-13T01:47:48Z","published":"2023-01-02T04:19:56Z","title":"A Policy Optimization Method Towards Optimal-time Stability","summary":" In current model-free reinforcement learning (RL) algorithms, stability\ncriteria based on sampling methods are commonly utilized to guide policy\noptimization. However, these criteria only guarantee the infinite-time\nconvergence of the system's state to an equilibrium point, which leads to\nsub-optimality of the policy. In this paper, we propose a policy optimization\ntechnique incorporating sampling-based Lyapunov stability. Our approach enables\nthe system's state to reach an equilibrium point within an optimal time and\nmaintain stability thereafter, referred to as \"optimal-time stability\". To\nachieve this, we integrate the optimization method into the Actor-Critic\nframework, resulting in the development of the Adaptive Lyapunov-based\nActor-Critic (ALAC) algorithm. Through evaluations conducted on ten robotic\ntasks, our approach outperforms previous studies significantly, effectively\nguiding the system to generate stable patterns.\n","authors":["Shengjie Wang","Fengbo Lan","Xiang Zheng","Yuxue Cao","Oluwatosin Oseni","Haotian Xu","Tao Zhang","Yang Gao"],"pdf_url":"https://arxiv.org/pdf/2301.00521v2.pdf","comment":"27 pages, 11 figures. 
7th Annual Conference on Robot Learning. 2023"},{"id":"http://arxiv.org/abs/2310.08800v1","updated":"2023-10-13T01:18:41Z","published":"2023-10-13T01:18:41Z","title":"DDMT: Denoising Diffusion Mask Transformer Models for Multivariate Time\n Series Anomaly Detection","summary":" Anomaly detection in multivariate time series has emerged as a crucial\nchallenge in time series research, with significant research implications in\nvarious fields such as fraud detection, fault diagnosis, and system state\nestimation. Reconstruction-based models have shown promising potential in\nrecent years for detecting anomalies in time series data. However, due to the\nrapid increase in data scale and dimensionality, the issues of noise and Weak\nIdentity Mapping (WIM) during time series reconstruction have become\nincreasingly pronounced. To address this, we introduce a novel Adaptive Dynamic\nNeighbor Mask (ADNM) mechanism and integrate it with the Transformer and\nDenoising Diffusion Model, creating a new framework for multivariate time\nseries anomaly detection, named Denoising Diffusion Mask Transformer (DDMT).\nThe ADNM module is introduced to mitigate information leakage between input and\noutput features during data reconstruction, thereby alleviating the problem of\nWIM during reconstruction. The Denoising Diffusion Transformer (DDT) employs\nthe Transformer as an internal neural network structure for Denoising Diffusion\nModel. It learns the stepwise generation process of time series data to model\nthe probability distribution of the data, capturing normal data patterns and\nprogressively restoring time series data by removing noise, resulting in a\nclear recovery of anomalies. To the best of our knowledge, this is the first\nmodel that combines Denoising Diffusion Model and the Transformer for\nmultivariate time series anomaly detection. Experimental evaluations were\nconducted on five publicly available multivariate time series anomaly detection\ndatasets. The results demonstrate that the model effectively identifies\nanomalies in time series data, achieving state-of-the-art performance in\nanomaly detection.\n","authors":["Chaocheng Yang","Tingyin Wang","Xuanhui Yan"],"pdf_url":"https://arxiv.org/pdf/2310.08800v1.pdf","comment":"16 pages, 9 figures"},{"id":"http://arxiv.org/abs/2310.08475v2","updated":"2023-10-13T01:12:25Z","published":"2023-10-12T16:32:44Z","title":"Can We Edit Multimodal Large Language Models?","summary":" In this paper, we focus on editing Multimodal Large Language Models (MLLMs).\nCompared to editing single-modal LLMs, multimodal model editing is more\nchallenging, which demands a higher level of scrutiny and careful consideration\nin the editing process. To facilitate research in this area, we construct a new\nbenchmark, dubbed MMEdit, for editing multimodal LLMs and establishing a suite\nof innovative metrics for evaluation. We conduct comprehensive experiments\ninvolving various model editing baselines and analyze the impact of editing\ndifferent components for multimodal LLMs. Empirically, we notice that previous\nbaselines can implement editing multimodal LLMs to some extent, but the effect\nis still barely satisfactory, indicating the potential difficulty of this task.\nWe hope that our work can provide the NLP community with insights. 
Code and\ndataset are available in https://github.com/zjunlp/EasyEdit.\n","authors":["Siyuan Cheng","Bozhong Tian","Qingbin Liu","Xi Chen","Yongheng Wang","Huajun Chen","Ningyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.08475v2.pdf","comment":"EMNLP 2023"},{"id":"http://arxiv.org/abs/2310.08795v1","updated":"2023-10-13T00:49:09Z","published":"2023-10-13T00:49:09Z","title":"Mitigating Bias for Question Answering Models by Tracking Bias Influence","summary":" Models of various NLP tasks have been shown to exhibit stereotypes, and the\nbias in the question answering (QA) models is especially harmful as the output\nanswers might be directly consumed by the end users. There have been datasets\nto evaluate bias in QA models, while bias mitigation technique for the QA\nmodels is still under-explored. In this work, we propose BMBI, an approach to\nmitigate the bias of multiple-choice QA models. Based on the intuition that a\nmodel would lean to be more biased if it learns from a biased example, we\nmeasure the bias level of a query instance by observing its influence on\nanother instance. If the influenced instance is more biased, we derive that the\nquery instance is biased. We then use the bias level detected as an\noptimization objective to form a multi-task learning setting in addition to the\noriginal QA task. We further introduce a new bias evaluation metric to quantify\nbias in a comprehensive and sensitive way. We show that our method could be\napplied to multiple QA formulations across multiple bias categories. It can\nsignificantly reduce the bias level in all 9 bias categories in the BBQ dataset\nwhile maintaining comparable QA accuracy.\n","authors":["Mingyu Derek Ma","Jiun-Yu Kao","Arpit Gupta","Yu-Hsiang Lin","Wenbo Zhao","Tagyoung Chung","Wei Wang","Kai-Wei Chang","Nanyun Peng"],"pdf_url":"https://arxiv.org/pdf/2310.08795v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08793v1","updated":"2023-10-13T00:46:12Z","published":"2023-10-13T00:46:12Z","title":"Analysis of Weather and Time Features in Machine Learning-aided ERCOT\n Load Forecasting","summary":" Accurate load forecasting is critical for efficient and reliable operations\nof the electric power system. A large part of electricity consumption is\naffected by weather conditions, making weather information an important\ndeterminant of electricity usage. Personal appliances and industry equipment\nalso contribute significantly to electricity demand with temporal patterns,\nmaking time a useful factor to consider in load forecasting. This work develops\nseveral machine learning (ML) models that take various time and weather\ninformation as part of the input features to predict the short-term system-wide\ntotal load. Ablation studies were also performed to investigate and compare the\nimpacts of different weather factors on the prediction accuracy. Actual load\nand historical weather data for the same region were processed and then used to\ntrain the ML models. It is interesting to observe that using all available\nfeatures, each of which may be correlated to the load, is unlikely to achieve\nthe best forecasting performance; features with redundancy may even decrease\nthe inference capabilities of ML models. This indicates the importance of\nfeature selection for ML models. 
Overall, case studies demonstrated the\neffectiveness of ML models trained with different weather and time input\nfeatures for ERCOT load forecasting.\n","authors":["Jonathan Yang","Mingjian Tuo","Jin Lu","Xingpeng Li"],"pdf_url":"https://arxiv.org/pdf/2310.08793v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07171v3","updated":"2023-10-13T00:35:33Z","published":"2023-10-11T03:39:56Z","title":"Federated Generalization via Information-Theoretic Distribution\n Diversification","summary":" Federated Learning (FL) has surged in prominence due to its capability of\ncollaborative model training without direct data sharing. However, the vast\ndisparity in local data distributions among clients, often termed the\nnon-Independent Identically Distributed (non-IID) challenge, poses a\nsignificant hurdle to FL's generalization efficacy. The scenario becomes even\nmore complex when not all clients participate in the training process, a common\noccurrence due to unstable network connections or limited computational\ncapacities. This can greatly complicate the assessment of the trained models'\ngeneralization abilities. While a plethora of recent studies has centered on\nthe generalization gap pertaining to unseen data from participating clients\nwith diverse distributions, the divergence between the training distributions\nof participating clients and the testing distributions of non-participating\nones has been largely overlooked. In response, our paper unveils an\ninformation-theoretic generalization framework for FL. Specifically, it\nquantifies generalization errors by evaluating the information entropy of local\ndistributions and discerning discrepancies across these distributions. Inspired\nby our deduced generalization bounds, we introduce a weighted aggregation\napproach and a duo of client selection strategies. These innovations aim to\nbolster FL's generalization prowess by encompassing a more varied set of client\ndata distributions. Our extensive empirical evaluations reaffirm the potency of\nour proposed methods, aligning seamlessly with our theoretical construct.\n","authors":["Zheshun Wu","Zenglin Xu","Dun Zeng","Qifan Wang"],"pdf_url":"https://arxiv.org/pdf/2310.07171v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08792v1","updated":"2023-10-13T00:34:12Z","published":"2023-10-13T00:34:12Z","title":"Incentive Mechanism Design for Distributed Ensemble Learning","summary":" Distributed ensemble learning (DEL) involves training multiple models at\ndistributed learners, and then combining their predictions to improve\nperformance. Existing related studies focus on DEL algorithm design and\noptimization but ignore the important issue of incentives, without which\nself-interested learners may be unwilling to participate in DEL. We aim to fill\nthis gap by presenting a first study on the incentive mechanism design for DEL.\nOur proposed mechanism specifies both the amount of training data and reward\nfor learners with heterogeneous computation and communication costs. One design\nchallenge is to have an accurate understanding regarding how learners'\ndiversity (in terms of training data) affects the ensemble accuracy. To this\nend, we decompose the ensemble accuracy into a diversity-precision tradeoff to\nguide the mechanism design. Another challenge is that the mechanism design\ninvolves solving a mixed-integer program with a large search space. To this\nend, we propose an alternating algorithm that iteratively updates each\nlearner's training data size and reward. 
We prove that under mild conditions,\nthe algorithm converges. Numerical results using MNIST dataset show an\ninteresting result: our proposed mechanism may prefer a lower level of learner\ndiversity to achieve a higher ensemble accuracy.\n","authors":["Chao Huang","Pengchao Han","Jianwei Huang"],"pdf_url":"https://arxiv.org/pdf/2310.08792v1.pdf","comment":"Accepted to IEEE GLOBECOM 2023"},{"id":"http://arxiv.org/abs/2302.04062v5","updated":"2023-10-13T00:29:41Z","published":"2023-02-08T13:59:31Z","title":"Machine Learning for Synthetic Data Generation: A Review","summary":" Machine learning heavily relies on data, but real-world applications often\nencounter various data-related issues. These include data of poor quality,\ninsufficient data points leading to under-fitting of machine learning models,\nand difficulties in data access due to concerns surrounding privacy, safety,\nand regulations. In light of these challenges, the concept of synthetic data\ngeneration emerges as a promising alternative that allows for data sharing and\nutilization in ways that real-world data cannot facilitate. This paper presents\na comprehensive systematic review of existing studies that employ machine\nlearning models for the purpose of generating synthetic data. The review\nencompasses various perspectives, starting with the applications of synthetic\ndata generation, spanning computer vision, speech, natural language processing,\nhealthcare, and business domains. Additionally, it explores different machine\nlearning methods, with particular emphasis on neural network architectures and\ndeep generative models. The paper also addresses the crucial aspects of privacy\nand fairness concerns related to synthetic data generation. Furthermore, this\nstudy identifies the challenges and opportunities prevalent in this emerging\nfield, shedding light on the potential avenues for future research. By delving\ninto the intricacies of synthetic data generation, this paper aims to\ncontribute to the advancement of knowledge and inspire further exploration in\nsynthetic data generation.\n","authors":["Yingzhou Lu","Minjie Shen","Huazheng Wang","Capucine van Rechem","Wenqi Wei"],"pdf_url":"https://arxiv.org/pdf/2302.04062v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08790v1","updated":"2023-10-13T00:25:21Z","published":"2023-10-13T00:25:21Z","title":"Price of Stability in Quality-Aware Federated Learning","summary":" Federated Learning (FL) is a distributed machine learning scheme that enables\nclients to train a shared global model without exchanging local data. The\npresence of label noise can severely degrade the FL performance, and some\nexisting studies have focused on algorithm design for label denoising. However,\nthey ignored the important issue that clients may not apply costly label\ndenoising strategies due to them being self-interested and having heterogeneous\nvaluations on the FL performance. To fill this gap, we model the clients'\ninteractions as a novel label denoising game and characterize its equilibrium.\nWe also analyze the price of stability, which quantifies the difference in the\nsystem performance (e.g., global model accuracy, social welfare) between the\nequilibrium outcome and the socially optimal solution. We prove that the\nequilibrium outcome always leads to a lower global model accuracy than the\nsocially optimal solution does. We further design an efficient algorithm to\ncompute the socially optimal solution. 
Numerical experiments on MNIST dataset\nshow that the price of stability increases as the clients' data become noisier,\ncalling for an effective incentive mechanism.\n","authors":["Yizhou Yan","Xinyu Tang","Chao Huang","Ming Tang"],"pdf_url":"https://arxiv.org/pdf/2310.08790v1.pdf","comment":"Accepted to IEEE GLOBECOM 2023"},{"id":"http://arxiv.org/abs/2310.08782v1","updated":"2023-10-13T00:07:49Z","published":"2023-10-13T00:07:49Z","title":"Selectivity Drives Productivity: Efficient Dataset Pruning for Enhanced\n Transfer Learning","summary":" Massive data is often considered essential for deep learning applications,\nbut it also incurs significant computational and infrastructural costs.\nTherefore, dataset pruning (DP) has emerged as an effective way to improve data\nefficiency by identifying and removing redundant training samples without\nsacrificing performance. In this work, we aim to address the problem of DP for\ntransfer learning, i.e., how to prune a source dataset for improved pretraining\nefficiency and lossless finetuning accuracy on downstream target tasks. To our\nbest knowledge, the problem of DP for transfer learning remains open, as\nprevious studies have primarily addressed DP and transfer learning as separate\nproblems. By contrast, we establish a unified viewpoint to integrate DP with\ntransfer learning and find that existing DP methods are not suitable for the\ntransfer learning paradigm. We then propose two new DP methods, label mapping\nand feature mapping, for supervised and self-supervised pretraining settings\nrespectively, by revisiting the DP problem through the lens of source-target\ndomain mapping. Furthermore, we demonstrate the effectiveness of our approach\non numerous transfer learning tasks. We show that source data classes can be\npruned by up to 40% ~ 80% without sacrificing downstream performance, resulting\nin a significant 2 ~ 5 times speed-up during the pretraining stage. Besides,\nour proposal exhibits broad applicability and can improve other computationally\nintensive transfer learning techniques, such as adversarial pretraining. Codes\nare available at https://github.com/OPTML-Group/DP4TL.\n","authors":["Yihua Zhang","Yimeng Zhang","Aochuan Chen","Jinghan Jia","Jiancheng Liu","Gaowen Liu","Mingyi Hong","Shiyu Chang","Sijia Liu"],"pdf_url":"https://arxiv.org/pdf/2310.08782v1.pdf","comment":"Thirty-seventh Conference on Neural Information Processing Systems\n (NeurIPS 2023)"}],"Multimedia":[{"id":"http://arxiv.org/abs/2310.09147v1","updated":"2023-10-13T14:39:34Z","published":"2023-10-13T14:39:34Z","title":"Exploring Sparse Spatial Relation in Graph Inference for Text-Based VQA","summary":" Text-based visual question answering (TextVQA) faces the significant\nchallenge of avoiding redundant relational inference. To be specific, a large\nnumber of detected objects and optical character recognition (OCR) tokens\nresult in rich visual relationships. Existing works take all visual\nrelationships into account for answer prediction. However, there are three\nobservations: (1) a single subject in the images can be easily detected as\nmultiple objects with distinct bounding boxes (considered repetitive objects).\nThe associations between these repetitive objects are superfluous for answer\nreasoning; (2) two spatially distant OCR tokens detected in the image\nfrequently have weak semantic dependencies for answer reasoning; and (3) the\nco-existence of nearby objects and tokens may be indicative of important visual\ncues for predicting answers. 
Rather than utilizing all of them for answer\nprediction, we make an effort to identify the most important connections or\neliminate redundant ones. We propose a sparse spatial graph network (SSGN) that\nintroduces a spatially aware relation pruning technique to this task. As\nspatial factors for relation measurement, we employ spatial distance, geometric\ndimension, overlap area, and DIoU for spatially aware pruning. We consider\nthree visual relationships for graph learning: object-object, OCR-OCR tokens,\nand object-OCR token relationships. SSGN is a progressive graph learning\narchitecture that verifies the pivotal relations in the correlated object-token\nsparse graph, and then in the respective object-based sparse graph and\ntoken-based sparse graph. Experiment results on TextVQA and ST-VQA datasets\ndemonstrate that SSGN achieves promising performances. And some visualization\nresults further demonstrate the interpretability of our method.\n","authors":["Sheng Zhou","Dan Guo","Jia Li","Xun Yang","Meng Wang"],"pdf_url":"https://arxiv.org/pdf/2310.09147v1.pdf","comment":"Accepted by TIP 2023"},{"id":"http://arxiv.org/abs/2306.09675v3","updated":"2023-10-13T13:48:58Z","published":"2023-06-16T08:13:41Z","title":"Multi-View Class Incremental Learning","summary":" Multi-view learning (MVL) has gained great success in integrating information\nfrom multiple perspectives of a dataset to improve downstream task performance.\nTo make MVL methods more practical in an open-ended environment, this paper\ninvestigates a novel paradigm called multi-view class incremental learning\n(MVCIL), where a single model incrementally classifies new classes from a\ncontinual stream of views, requiring no access to earlier views of data.\nHowever, MVCIL is challenged by the catastrophic forgetting of old information\nand the interference with learning new concepts. To address this, we first\ndevelop a randomization-based representation learning technique serving for\nfeature extraction to guarantee their separate view-optimal working states,\nduring which multiple views belonging to a class are presented sequentially;\nThen, we integrate them one by one in the orthogonality fusion subspace spanned\nby the extracted features; Finally, we introduce selective weight consolidation\nfor learning-without-forgetting decision-making while encountering new classes.\nExtensive experiments on synthetic and real-world datasets validate the\neffectiveness of our approach.\n","authors":["Depeng Li","Tianqi Wang","Junwei Chen","Kenji Kawaguchi","Cheng Lian","Zhigang Zeng"],"pdf_url":"https://arxiv.org/pdf/2306.09675v3.pdf","comment":"Accepted to Information Fusion"},{"id":"http://arxiv.org/abs/2310.09036v1","updated":"2023-10-13T11:57:04Z","published":"2023-10-13T11:57:04Z","title":"MM-BigBench: Evaluating Multimodal Models on Multimodal Content\n Comprehension Tasks","summary":" The popularity of multimodal large language models (MLLMs) has triggered a\nrecent surge in research efforts dedicated to evaluating these models.\nNevertheless, existing evaluation studies of MLLMs primarily focus on the\ncomprehension and reasoning of unimodal (vision) content, neglecting\nperformance evaluations in the domain of multimodal (vision-language) content\nunderstanding. Beyond multimodal reasoning, tasks related to multimodal content\ncomprehension necessitate a profound understanding of multimodal contexts,\nachieved through the multimodal interaction to obtain a final answer. 
In this\npaper, we introduce a comprehensive assessment framework called MM-BigBench,\nwhich incorporates a diverse range of metrics to offer an extensive evaluation\nof the performance of various models and instructions across a wide spectrum of\ndiverse multimodal content comprehension tasks. Consequently, our work\ncomplements research on the performance of MLLMs in multimodal comprehension\ntasks, achieving a more comprehensive and holistic evaluation of MLLMs. To\nbegin, we employ the Best Performance metric to ascertain each model's\nperformance upper bound on different datasets. Subsequently, the Mean Relative\nGain metric offers an assessment of the overall performance of various models\nand instructions, while the Stability metric measures their sensitivity.\nFurthermore, previous research centers on evaluating models independently or\nsolely assessing instructions, neglecting the adaptability between models and\ninstructions. We propose the Adaptability metric to quantify the adaptability\nbetween models and instructions. Our paper evaluates a total of 20 language\nmodels (14 MLLMs) on 14 multimodal datasets spanning 6 tasks, with 10\ninstructions for each task, and derives novel insights. Our code will be\nreleased at https://github.com/declare-lab/MM-BigBench.\n","authors":["Xiaocui Yang","Wenfang Wu","Shi Feng","Ming Wang","Daling Wang","Yang Li","Qi Sun","Yifei Zhang","Xiaoming Fu","Soujanya Poria"],"pdf_url":"https://arxiv.org/pdf/2310.09036v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2310.08981v1","updated":"2023-10-13T09:57:09Z","published":"2023-10-13T09:57:09Z","title":"Low-latency Speech Enhancement via Speech Token Generation","summary":" Existing deep learning based speech enhancement methods mainly employ a data-driven\napproach, which leverages large amounts of data with a variety of noise types to\nachieve noise removal from noisy signals. However, the high dependence on the\ndata limits their generalization to unseen complex noises in real-life\nenvironments. In this paper, we focus on the low-latency scenario and regard\nspeech enhancement as a speech generation problem conditioned on the noisy\nsignal, where we generate clean speech instead of identifying and removing\nnoises. Specifically, we propose a conditional generative framework for speech\nenhancement, which models clean speech by acoustic codes of a neural speech\ncodec and generates the speech codes conditioned on past noisy frames in an\nauto-regressive way. Moreover, we propose an explicit-alignment approach to\nalign noisy frames with the generated speech tokens to improve the robustness\nand scalability to different input lengths. Different from other methods that\nleverage multiple stages to generate speech codes, we leverage a single-stage\nspeech generation approach based on the TF-Codec neural codec to achieve high\nspeech quality with low latency. Extensive results on both synthetic and\nreal-recorded test sets show its superiority over data-driven approaches in\nterms of noise robustness and temporal speech coherence.\n","authors":["Huaying Xue","Xiulian Peng","Yan Lu"],"pdf_url":"https://arxiv.org/pdf/2310.08981v1.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2112.09726v3","updated":"2023-10-13T08:10:41Z","published":"2021-12-17T19:22:01Z","title":"Soundify: Matching Sound Effects to Video","summary":" In the art of video editing, sound helps add character to an object and\nimmerse the viewer within a space. 
Through formative interviews with\nprofessional editors (N=10), we found that the task of adding sounds to video\ncan be challenging. This paper presents Soundify, a system that assists editors\nin matching sounds to video. Given a video, Soundify identifies matching\nsounds, synchronizes the sounds to the video, and dynamically adjusts panning\nand volume to create spatial audio. In a human evaluation study (N=889), we\nshow that Soundify is capable of matching sounds to video out-of-the-box for a\ndiverse range of audio categories. In a within-subjects expert study (N=12), we\ndemonstrate the usefulness of Soundify in helping video editors match sounds to\nvideo with lighter workload, reduced task completion time, and improved\nusability.\n","authors":["David Chuan-En Lin","Anastasis Germanidis","Cristóbal Valenzuela","Yining Shi","Nikolas Martelaro"],"pdf_url":"https://arxiv.org/pdf/2112.09726v3.pdf","comment":"Full paper in UIST 2023; Short paper in NeurIPS 2021 ML4CD Workshop;\n Online demo: http://soundify.cc"},{"id":"http://arxiv.org/abs/2310.08475v2","updated":"2023-10-13T01:12:25Z","published":"2023-10-12T16:32:44Z","title":"Can We Edit Multimodal Large Language Models?","summary":" In this paper, we focus on editing Multimodal Large Language Models (MLLMs).\nCompared to editing single-modal LLMs, multimodal model editing is more\nchallenging, which demands a higher level of scrutiny and careful consideration\nin the editing process. To facilitate research in this area, we construct a new\nbenchmark, dubbed MMEdit, for editing multimodal LLMs and establishing a suite\nof innovative metrics for evaluation. We conduct comprehensive experiments\ninvolving various model editing baselines and analyze the impact of editing\ndifferent components for multimodal LLMs. Empirically, we notice that previous\nbaselines can implement editing multimodal LLMs to some extent, but the effect\nis still barely satisfactory, indicating the potential difficulty of this task.\nWe hope that our work can provide the NLP community with insights. Code and\ndataset are available in https://github.com/zjunlp/EasyEdit.\n","authors":["Siyuan Cheng","Bozhong Tian","Qingbin Liu","Xi Chen","Yongheng Wang","Huajun Chen","Ningyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.08475v2.pdf","comment":"EMNLP 2023"}]},"2023-10-17T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2310.11454v1","updated":"2023-10-17T17:59:46Z","published":"2023-10-17T17:59:46Z","title":"VeRA: Vector-based Random Matrix Adaptation","summary":" Low-rank adapation (LoRA) is a popular method that reduces the number of\ntrainable parameters when finetuning large language models, but still faces\nacute storage challenges when scaling to even larger models or deploying\nnumerous per-user or per-task adapted models. In this work, we present\nVector-based Random Matrix Adaptation (VeRA), which reduces the number of\ntrainable parameters by 10x compared to LoRA, yet maintains the same\nperformance. It achieves this by using a single pair of low-rank matrices\nshared across all layers and learning small scaling vectors instead. 
We\ndemonstrate its effectiveness on the GLUE and E2E benchmarks, and show its\napplication in instruction-following with just 1.4M parameters using the Llama2\n7B model.\n","authors":["Dawid Jan Kopiczko","Tijmen Blankevoort","Yuki Markus Asano"],"pdf_url":"https://arxiv.org/pdf/2310.11454v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11453v1","updated":"2023-10-17T17:59:15Z","published":"2023-10-17T17:59:15Z","title":"BitNet: Scaling 1-bit Transformers for Large Language Models","summary":" The increasing size of large language models has posed challenges for\ndeployment and raised concerns about environmental impact due to high energy\nconsumption. In this work, we introduce BitNet, a scalable and stable 1-bit\nTransformer architecture designed for large language models. Specifically, we\nintroduce BitLinear as a drop-in replacement of the nn.Linear layer in order to\ntrain 1-bit weights from scratch. Experimental results on language modeling\nshow that BitNet achieves competitive performance while substantially reducing\nmemory footprint and energy consumption, compared to state-of-the-art 8-bit\nquantization methods and FP16 Transformer baselines. Furthermore, BitNet\nexhibits a scaling law akin to full-precision Transformers, suggesting its\npotential for effective scaling to even larger language models while\nmaintaining efficiency and performance benefits.\n","authors":["Hongyu Wang","Shuming Ma","Li Dong","Shaohan Huang","Huaijie Wang","Lingxiao Ma","Fan Yang","Ruiping Wang","Yi Wu","Furu Wei"],"pdf_url":"https://arxiv.org/pdf/2310.11453v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2310.11451v1","updated":"2023-10-17T17:58:34Z","published":"2023-10-17T17:58:34Z","title":"Seeking Neural Nuggets: Knowledge Transfer in Large Language Models from\n a Parametric Perspective","summary":" Large Language Models (LLMs) inherently encode a wealth of knowledge within\ntheir parameters through pre-training on extensive corpora. While prior\nresearch has delved into operations on these parameters to manipulate the\nunderlying implicit knowledge (encompassing detection, editing, and merging),\nthere remains an ambiguous understanding regarding their transferability across\nmodels with varying scales. In this paper, we seek to empirically investigate\nknowledge transfer from larger to smaller models through a parametric\nperspective. To achieve this, we employ sensitivity-based techniques to extract\nand align knowledge-specific parameters between different LLMs. Moreover, the\nLoRA module is used as the intermediary mechanism for injecting the extracted\nknowledge into smaller models. Evaluations across four benchmarks validate the\nefficacy of our proposed method. Our findings highlight the critical factors\ncontributing to the process of parametric knowledge transfer, underscoring the\ntransferability of model parameters across LLMs of different scales. We release\ncode and data at \\url{https://github.com/maszhongming/ParaKnowTransfer}.\n","authors":["Ming Zhong","Chenxin An","Weizhu Chen","Jiawei Han","Pengcheng He"],"pdf_url":"https://arxiv.org/pdf/2310.11451v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2310.11446v1","updated":"2023-10-17T17:56:18Z","published":"2023-10-17T17:56:18Z","title":"Functional Invariants to Watermark Large Transformers","summary":" The rapid growth of transformer-based models increases the concerns about\ntheir integrity and ownership insurance. 
Watermarking addresses this issue by\nembedding a unique identifier into the model, while preserving its performance.\nHowever, most existing approaches require to optimize the weights to imprint\nthe watermark signal, which is not suitable at scale due to the computational\ncost. This paper explores watermarks with virtually no computational cost,\napplicable to a non-blind white-box setting (assuming access to both the\noriginal and watermarked networks). They generate functionally equivalent\ncopies by leveraging the models' invariance, via operations like dimension\npermutations or scaling/unscaling. This enables to watermark models without any\nchange in their outputs and remains stealthy. Experiments demonstrate the\neffectiveness of the approach and its robustness against various model\ntransformations (fine-tuning, quantization, pruning), making it a practical\nsolution to protect the integrity of large models.\n","authors":["Fernandez Pierre","Couairon Guillaume","Furon Teddy","Douze Matthijs"],"pdf_url":"https://arxiv.org/pdf/2310.11446v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11441v1","updated":"2023-10-17T17:51:31Z","published":"2023-10-17T17:51:31Z","title":"Set-of-Mark Prompting Unleashes Extraordinary Visual Grounding in GPT-4V","summary":" We present Set-of-Mark (SoM), a new visual prompting method, to unleash the\nvisual grounding abilities of large multimodal models (LMMs), such as GPT-4V.\nAs illustrated in Fig. 1 (right), we employ off-the-shelf interactive\nsegmentation models, such as SAM, to partition an image into regions at\ndifferent levels of granularity, and overlay these regions with a set of marks\ne.g., alphanumerics, masks, boxes. Using the marked image as input, GPT-4V can\nanswer the questions that require visual grounding. We perform a comprehensive\nempirical study to validate the effectiveness of SoM on a wide range of\nfine-grained vision and multimodal tasks. For example, our experiments show\nthat GPT-4V with SoM outperforms the state-of-the-art fully-finetuned referring\nsegmentation model on RefCOCOg in a zero-shot setting.\n","authors":["Jianwei Yang","Hao Zhang","Feng Li","Xueyan Zou","Chunyuan Li","Jianfeng Gao"],"pdf_url":"https://arxiv.org/pdf/2310.11441v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.09896v4","updated":"2023-10-17T17:51:27Z","published":"2023-06-16T15:13:17Z","title":"Is Self-Repair a Silver Bullet for Code Generation?","summary":" Large language models have shown remarkable aptitude in code generation, but\nstill struggle on challenging tasks. Self-repair -- in which the model debugs\nand fixes mistakes in its own code -- has recently become a popular way to\nboost performance in these settings. However, only very limited studies on how\nand when self-repair works effectively exist in the literature, and one might\nwonder to what extent a model is really capable of repairing mistakes in code\nwhich was originally generated by that very same model. In this paper, we\nanalyze Code Llama, GPT-3.5 and GPT-4's ability to perform self-repair on\nproblems taken from HumanEval or APPS, finding that when the cost of carrying\nout repair is taken into account, gains are often modest, vary significantly\nbetween subsets of the data, and are sometimes not present at all. 
We\nhypothesize that this is because self-repair is bottlenecked by the model's\nability to provide feedback on its own code; boosting the feedback with\nstronger models, we observe performance gains even in settings where the model\ndoes not benefit from self-repair. Finally, we find that providing the model\nwith feedback from human participants greatly benefits repair even for GPT-4,\nand carry out a brief qualitative analysis of the differences observed.\n","authors":["Theo X. Olausson","Jeevana Priya Inala","Chenglong Wang","Jianfeng Gao","Armando Solar-Lezama"],"pdf_url":"https://arxiv.org/pdf/2306.09896v4.pdf","comment":"Added experiments for HumanEval (dataset) and Code Llama (model)"},{"id":"http://arxiv.org/abs/2305.14260v2","updated":"2023-10-17T17:46:41Z","published":"2023-05-23T17:12:09Z","title":"R2H: Building Multimodal Navigation Helpers that Respond to Help\n Requests","summary":" Intelligent navigation-helper agents are critical as they can navigate users\nin unknown areas through environmental awareness and conversational ability,\nserving as potential accessibility tools for individuals with disabilities. In\nthis work, we first introduce a novel benchmark, Respond to Help Requests\n(R2H), to promote the development of multi-modal navigation helpers capable of\nresponding to requests for help, utilizing existing dialog-based embodied\ndatasets. R2H mainly includes two tasks: (1) Respond to Dialog History (RDH),\nwhich assesses the helper agent's ability to generate informative responses\nbased on a given dialog history, and (2) Respond during Interaction (RdI),\nwhich evaluates the effectiveness and efficiency of the response during\nconsistent cooperation with a task performer. Furthermore, we explore two\napproaches to construct the navigation-helper agent, including fine-tuning a\nnovel task-oriented multi-modal response generation model that can see and\nrespond, named SeeRee, and employing a multi-modal large language model in a\nzero-shot manner. Analysis of the task and method was conducted based on both\nautomatic benchmarking and human evaluations. Project website:\nhttps://sites.google.com/view/response2helprequests/home.\n","authors":["Yue Fan","Jing Gu","Kaizhi Zheng","Xin Eric Wang"],"pdf_url":"https://arxiv.org/pdf/2305.14260v2.pdf","comment":"EMNLP 2023"},{"id":"http://arxiv.org/abs/2310.11430v1","updated":"2023-10-17T17:40:21Z","published":"2023-10-17T17:40:21Z","title":"An Empirical Study of Translation Hypothesis Ensembling with Large\n Language Models","summary":" Large language models (LLMs) are becoming a one-fits-many solution, but they\nsometimes hallucinate or produce unreliable output. In this paper, we\ninvestigate how hypothesis ensembling can improve the quality of the generated\ntext for the specific problem of LLM-based machine translation. We experiment\nwith several techniques for ensembling hypotheses produced by LLMs such as\nChatGPT, LLaMA, and Alpaca. We provide a comprehensive study along multiple\ndimensions, including the method to generate hypotheses (multiple prompts,\ntemperature-based sampling, and beam search) and the strategy to produce the\nfinal translation (instruction-based, quality-based reranking, and minimum\nBayes risk (MBR) decoding). 
Our results show that MBR decoding is a very\neffective method, that translation quality can be improved using a small number\nof samples, and that instruction tuning has a strong impact on the relation\nbetween the diversity of the hypotheses and the sampling temperature.\n","authors":["António Farinhas","José G. C. de Souza","André F. T. Martins"],"pdf_url":"https://arxiv.org/pdf/2310.11430v1.pdf","comment":"EMNLP 2023 (main conference)"},{"id":"http://arxiv.org/abs/2309.16039v2","updated":"2023-10-17T17:32:17Z","published":"2023-09-27T21:41:49Z","title":"Effective Long-Context Scaling of Foundation Models","summary":" We present a series of long-context LLMs that support effective context\nwindows of up to 32,768 tokens. Our model series are built through continual\npretraining from Llama 2 with longer training sequences and on a dataset where\nlong texts are upsampled. We perform extensive evaluation on language modeling,\nsynthetic context probing tasks, and a wide range of research benchmarks. On\nresearch benchmarks, our models achieve consistent improvements on most regular\ntasks and significant improvements on long-context tasks over Llama 2. Notably,\nwith a cost-effective instruction tuning procedure that does not require\nhuman-annotated long instruction data, the 70B variant can already surpass\ngpt-3.5-turbo-16k's overall performance on a suite of long-context tasks.\nAlongside these results, we provide an in-depth analysis on the individual\ncomponents of our method. We delve into Llama's position encodings and discuss\nits limitation in modeling long dependencies. We also examine the impact of\nvarious design choices in the pretraining process, including the data mix and\nthe training curriculum of sequence lengths -- our ablation experiments suggest\nthat having abundant long texts in the pretrain dataset is not the key to\nachieving strong performance, and we empirically verify that long context\ncontinual pretraining is more efficient and similarly effective compared to\npretraining from scratch with long sequences.\n","authors":["Wenhan Xiong","Jingyu Liu","Igor Molybog","Hejia Zhang","Prajjwal Bhargava","Rui Hou","Louis Martin","Rashi Rungta","Karthik Abinav Sankararaman","Barlas Oguz","Madian Khabsa","Han Fang","Yashar Mehdad","Sharan Narang","Kshitiz Malik","Angela Fan","Shruti Bhosale","Sergey Edunov","Mike Lewis","Sinong Wang","Hao Ma"],"pdf_url":"https://arxiv.org/pdf/2309.16039v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.10066v2","updated":"2023-10-17T17:24:36Z","published":"2023-09-18T18:33:40Z","title":"Automatic Personalized Impression Generation for PET Reports Using Large\n Language Models","summary":" In this study, we aimed to determine if fine-tuned large language models\n(LLMs) can generate accurate, personalized impressions for whole-body PET\nreports. Twelve language models were trained on a corpus of PET reports using\nthe teacher-forcing algorithm, with the report findings as input and the\nclinical impressions as reference. An extra input token encodes the reading\nphysician's identity, allowing models to learn physician-specific reporting\nstyles. Our corpus comprised 37,370 retrospective PET reports collected from\nour institution between 2010 and 2022. To identify the best LLM, 30 evaluation\nmetrics were benchmarked against quality scores from two nuclear medicine (NM)\nphysicians, with the most aligned metrics selecting the model for expert\nevaluation. 
In a subset of data, model-generated impressions and original\nclinical impressions were assessed by three NM physicians according to 6\nquality dimensions (3-point scale) and an overall utility score (5-point\nscale). Each physician reviewed 12 of their own reports and 12 reports from\nother physicians. Bootstrap resampling was used for statistical analysis. Of\nall evaluation metrics, domain-adapted BARTScore and PEGASUSScore showed the\nhighest Spearman's rank correlations (0.568 and 0.563) with physician\npreferences. Based on these metrics, the fine-tuned PEGASUS model was selected\nas the top LLM. When physicians reviewed PEGASUS-generated impressions in their\nown style, 89% were considered clinically acceptable, with a mean utility score\nof 4.08 out of 5. Physicians rated these personalized impressions as comparable\nin overall utility to the impressions dictated by other physicians (4.03,\nP=0.41). In conclusion, personalized impressions generated by PEGASUS were\nclinically useful, highlighting its potential to expedite PET reporting.\n","authors":["Xin Tie","Muheon Shin","Ali Pirasteh","Nevein Ibrahim","Zachary Huemann","Sharon M. Castellino","Kara M. Kelly","John Garrett","Junjie Hu","Steve Y. Cho","Tyler J. Bradshaw"],"pdf_url":"https://arxiv.org/pdf/2309.10066v2.pdf","comment":"25 pages in total. 6 figures and 3 tables in the main body. The\n manuscript has been submitted to a journal for potential publication"},{"id":"http://arxiv.org/abs/2309.15714v2","updated":"2023-10-17T17:17:59Z","published":"2023-09-27T15:12:08Z","title":"Integrating LLM, EEG, and Eye-Tracking Biomarker Analysis for Word-Level\n Neural State Classification in Semantic Inference Reading Comprehension","summary":" With the recent proliferation of large language models (LLMs), such as\nGenerative Pre-trained Transformers (GPT), there has been a significant shift\nin exploring human and machine comprehension of semantic language meaning. This\nshift calls for interdisciplinary research that bridges cognitive science and\nnatural language processing (NLP). This pilot study aims to provide insights\ninto individuals' neural states during a semantic relation\nreading-comprehension task. We propose jointly analyzing LLMs, eye-gaze, and\nelectroencephalographic (EEG) data to study how the brain processes words with\nvarying degrees of relevance to a keyword during reading. We also use a feature\nengineering approach to improve the fixation-related EEG data classification\nwhile participants read words with high versus low relevance to the keyword.\nThe best validation accuracy in this word-level classification is over 60\\%\nacross 12 subjects. Words of high relevance to the inference keyword had\nsignificantly more eye fixations per word: 1.0584 compared to 0.6576 when\nexcluding no-fixation words, and 1.5126 compared to 1.4026 when including them.\nThis study represents the first attempt to classify brain states at a word\nlevel using LLM knowledge. 
It provides valuable insights into human cognitive\nabilities and the realm of Artificial General Intelligence (AGI), and offers\nguidance for developing potential reading-assisted technologies.\n","authors":["Yuhong Zhang","Qin Li","Sujal Nahata","Tasnia Jamal","Shih-kuen Cheng","Gert Cauwenberghs","Tzyy-Ping Jung"],"pdf_url":"https://arxiv.org/pdf/2309.15714v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.13812v2","updated":"2023-10-17T17:07:29Z","published":"2023-05-23T08:28:38Z","title":"Coarse-to-Fine Contrastive Learning in Image-Text-Graph Space for\n Improved Vision-Language Compositionality","summary":" Contrastively trained vision-language models have achieved remarkable\nprogress in vision and language representation learning, leading to\nstate-of-the-art models for various downstream multimodal tasks. However,\nrecent research has highlighted severe limitations of these models in their\nability to perform compositional reasoning over objects, attributes, and\nrelations. Scene graphs have emerged as an effective way to understand images\ncompositionally. These are graph-structured semantic representations of images\nthat contain objects, their attributes, and relations with other objects in a\nscene. In this work, we consider the scene graph parsed from text as a proxy\nfor the image scene graph and propose a graph decomposition and augmentation\nframework along with a coarse-to-fine contrastive learning objective between\nimages and text that aligns sentences of various complexities to the same\nimage. Along with this, we propose novel negative mining techniques in the\nscene graph space for improving attribute binding and relation understanding.\nThrough extensive experiments, we demonstrate the effectiveness of our approach\nthat significantly improves attribute binding, relation understanding,\nsystematic generalization, and productivity on multiple recently proposed\nbenchmarks (For example, improvements upto $18\\%$ for systematic\ngeneralization, $16.5\\%$ for relation understanding over a strong baseline),\nwhile achieving similar or better performance than CLIP on various general\nmultimodal tasks.\n","authors":["Harman Singh","Pengchuan Zhang","Qifan Wang","Mengjiao Wang","Wenhan Xiong","Jingfei Du","Yu Chen"],"pdf_url":"https://arxiv.org/pdf/2305.13812v2.pdf","comment":"EMNLP 2023 (main)"},{"id":"http://arxiv.org/abs/2310.11398v1","updated":"2023-10-17T17:06:26Z","published":"2023-10-17T17:06:26Z","title":"Neural Attention: Enhancing QKV Calculation in Self-Attention Mechanism\n with Neural Networks","summary":" In the realm of deep learning, the self-attention mechanism has substantiated\nits pivotal role across a myriad of tasks, encompassing natural language\nprocessing and computer vision. Despite achieving success across diverse\napplications, the traditional self-attention mechanism primarily leverages\nlinear transformations for the computation of query, key, and value (QKV),\nwhich may not invariably be the optimal choice under specific circumstances.\nThis paper probes into a novel methodology for QKV computation-implementing a\nspecially-designed neural network structure for the calculation. Utilizing a\nmodified Marian model, we conducted experiments on the IWSLT 2017\nGerman-English translation task dataset and juxtaposed our method with the\nconventional approach. The experimental results unveil a significant\nenhancement in BLEU scores with our method. 
Furthermore, our approach also\nmanifested superiority when training the Roberta model with the Wikitext-103\ndataset, reflecting a notable reduction in model perplexity compared to its\noriginal counterpart. These experimental outcomes not only validate the\nefficacy of our method but also reveal the immense potential in optimizing the\nself-attention mechanism through neural network-based QKV computation, paving\nthe way for future research and practical applications. The source code and\nimplementation details for our proposed method can be accessed at\nhttps://github.com/ocislyjrti/NeuralAttention.\n","authors":["Muhan Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.11398v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.14324v2","updated":"2023-10-17T16:33:33Z","published":"2023-05-23T17:54:57Z","title":"Ties Matter: Meta-Evaluating Modern Metrics with Pairwise Accuracy and\n Tie Calibration","summary":" Kendall's tau is frequently used to meta-evaluate how well machine\ntranslation (MT) evaluation metrics score individual translations. Its focus on\npairwise score comparisons is intuitive but raises the question of how ties\nshould be handled, a gray area that has motivated different variants in the\nliterature. We demonstrate that, in settings like modern MT meta-evaluation,\nexisting variants have weaknesses arising from their handling of ties, and in\nsome situations can even be gamed. We propose instead to meta-evaluate metrics\nwith a version of pairwise accuracy that gives metrics credit for correctly\npredicting ties, in combination with a tie calibration procedure that\nautomatically introduces ties into metric scores, enabling fair comparison\nbetween metrics that do and do not predict ties. We argue and provide\nexperimental evidence that these modifications lead to fairer ranking-based\nassessments of metric performance.\n","authors":["Daniel Deutsch","George Foster","Markus Freitag"],"pdf_url":"https://arxiv.org/pdf/2305.14324v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11379v1","updated":"2023-10-17T16:22:18Z","published":"2023-10-17T16:22:18Z","title":"Robust Wake-Up Word Detection by Two-stage Multi-resolution Ensembles","summary":" Voice-based interfaces rely on a wake-up word mechanism to initiate\ncommunication with devices. However, achieving a robust, energy-efficient, and\nfast detection remains a challenge. This paper addresses these real production\nneeds by enhancing data with temporal alignments and using detection based on\ntwo phases with multi-resolution. It employs two models: a lightweight\non-device model for real-time processing of the audio stream and a verification\nmodel on the server-side, which is an ensemble of heterogeneous architectures\nthat refine detection. This scheme allows the optimization of two operating\npoints. To protect privacy, audio features are sent to the cloud instead of raw\naudio. The study investigated different parametric configurations for feature\nextraction to select one for on-device detection and another for the\nverification model. Furthermore, thirteen different audio classifiers were\ncompared in terms of performance and inference time. 
The proposed ensemble\noutperforms our stronger classifier in every noise condition.\n","authors":["Fernando López","Jordi Luque","Carlos Segura","Pablo Gómez"],"pdf_url":"https://arxiv.org/pdf/2310.11379v1.pdf","comment":"5 pages, 3 figures"},{"id":"http://arxiv.org/abs/2308.01263v2","updated":"2023-10-17T16:21:55Z","published":"2023-08-02T16:30:40Z","title":"XSTest: A Test Suite for Identifying Exaggerated Safety Behaviours in\n Large Language Models","summary":" Without proper safeguards, large language models will readily follow\nmalicious instructions and generate toxic content. This risk motivates safety\nefforts such as red-teaming and large-scale feedback learning, which aim to\nmake models both helpful and harmless. However, there is a tension between\nthese two objectives, since harmlessness requires models to refuse to comply\nwith unsafe prompts, and thus not be helpful. Recent anecdotal evidence\nsuggests that some models may have struck a poor balance, so that even clearly\nsafe prompts are refused if they use similar language to unsafe prompts or\nmention sensitive topics. In this paper, we introduce a new test suite called\nXSTest to identify such eXaggerated Safety behaviours in a systematic way.\nXSTest comprises 250 safe prompts across ten prompt types that well-calibrated\nmodels should not refuse to comply with, and 200 unsafe prompts as contrasts\nthat models, for most applications, should refuse. We describe XSTest's\ncreation and composition, and then use the test suite to highlight systematic\nfailure modes in state-of-the-art language models as well as more general\nchallenges in building safer language models.\n","authors":["Paul Röttger","Hannah Rose Kirk","Bertie Vidgen","Giuseppe Attanasio","Federico Bianchi","Dirk Hovy"],"pdf_url":"https://arxiv.org/pdf/2308.01263v2.pdf","comment":"v2 prepared for conference submission"},{"id":"http://arxiv.org/abs/2310.11374v1","updated":"2023-10-17T16:15:34Z","published":"2023-10-17T16:15:34Z","title":"DialogueLLM: Context and Emotion Knowledge-Tuned LLaMA Models for\n Emotion Recognition in Conversations","summary":" Large language models (LLMs) and their variants have shown extraordinary\nefficacy across numerous downstream natural language processing (NLP) tasks,\nwhich has presented a new vision for the development of NLP. Despite their\nremarkable performance in natural language generating (NLG), LLMs lack a\ndistinct focus on the emotion understanding domain. As a result, using LLMs for\nemotion recognition may lead to suboptimal and inadequate precision. Another\nlimitation of LLMs is that they are typical trained without leveraging\nmulti-modal information. To overcome these limitations, we propose DialogueLLM,\na context and emotion knowledge tuned LLM that is obtained by fine-tuning LLaMA\nmodels with 13,638 multi-modal (i.e., texts and videos) emotional dialogues.\nThe visual information is considered as the supplementary knowledge to\nconstruct high-quality instructions. We offer a comprehensive evaluation of our\nproposed model on three benchmarking emotion recognition in conversations (ERC)\ndatasets and compare the results against the SOTA baselines and other SOTA\nLLMs. 
Additionally, DialogueLLM-7B can be easily trained using LoRA on a 40GB\nA100 GPU in 5 hours, facilitating reproducibility for other researchers.\n","authors":["Yazhou Zhang","Mengyao Wang","Prayag Tiwari","Qiuchi Li","Benyou Wang","Jing Qin"],"pdf_url":"https://arxiv.org/pdf/2310.11374v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11368v1","updated":"2023-10-17T16:05:52Z","published":"2023-10-17T16:05:52Z","title":"VECHR: A Dataset for Explainable and Robust Classification of\n Vulnerability Type in the European Court of Human Rights","summary":" Recognizing vulnerability is crucial for understanding and implementing\ntargeted support to empower individuals in need. This is especially important\nat the European Court of Human Rights (ECtHR), where the court adapts\nConvention standards to meet actual individual needs and thus ensures effective\nhuman rights protection. However, the concept of vulnerability remains elusive\nat the ECtHR and no prior NLP research has dealt with it. To enable future\nresearch in this area, we present VECHR, a novel expert-annotated multi-label\ndataset comprising of vulnerability type classification and explanation\nrationale. We benchmark the performance of state-of-the-art models on VECHR\nfrom both prediction and explainability perspectives. Our results demonstrate\nthe challenging nature of the task with lower prediction performance and\nlimited agreement between models and experts. Further, we analyze the\nrobustness of these models in dealing with out-of-domain (OOD) data and observe\noverall limited performance. Our dataset poses unique challenges offering\nsignificant room for improvement regarding performance, explainability, and\nrobustness.\n","authors":["Shanshan Xu","Leon Staufer","Santosh T. Y. S. S","Oana Ichim","Corina Heri","Matthias Grabmair"],"pdf_url":"https://arxiv.org/pdf/2310.11368v1.pdf","comment":"Accepted to EMNLP 2022"},{"id":"http://arxiv.org/abs/2302.13959v2","updated":"2023-10-17T16:03:05Z","published":"2023-02-27T17:00:06Z","title":"Make Every Example Count: On the Stability and Utility of Self-Influence\n for Learning from Noisy NLP Datasets","summary":" Increasingly larger datasets have become a standard ingredient to advancing\nthe state-of-the-art in NLP. However, data quality might have already become\nthe bottleneck to unlock further gains. Given the diversity and the sizes of\nmodern datasets, standard data filtering is not straight-forward to apply,\nbecause of the multifacetedness of the harmful data and elusiveness of\nfiltering rules that would generalize across multiple tasks. We study the\nfitness of task-agnostic self-influence scores of training examples for data\ncleaning, analyze their efficacy in capturing naturally occurring outliers, and\ninvestigate to what extent self-influence based data cleaning can improve\ndownstream performance in machine translation, question answering and text\nclassification, building up on recent approaches to self-influence calculation\nand automated curriculum learning.\n","authors":["Irina Bejan","Artem Sokolov","Katja Filippova"],"pdf_url":"https://arxiv.org/pdf/2302.13959v2.pdf","comment":"Published at EMNLP 2023"},{"id":"http://arxiv.org/abs/2204.04793v2","updated":"2023-10-17T16:02:42Z","published":"2022-04-10T23:16:00Z","title":"Fake news detection using parallel BERT deep neural networks","summary":" Fake news is a growing challenge for social networks and media. 
Detecting\nfake news has been a problem for many years, but with the evolution of\nsocial networks and the increasing speed of news dissemination in recent years, it has\nreceived renewed attention. There are several approaches to solving this problem,\none of which is to detect fake news based on its text style using deep neural\nnetworks. In recent years, one of the most widely used forms of deep neural networks\nfor natural language processing is transfer learning with transformers. BERT is\none of the most promising transformers and outperforms other models in many NLP\nbenchmarks. In this article, we introduce MWPBert, which uses two parallel BERT\nnetworks to perform veracity detection on full-text news articles. One of the\nBERT networks encodes the news headline, and the other encodes the news body. Since the\ninput length of the BERT network is limited and constant and the news body is\nusually a long text, we cannot feed the whole news text into BERT.\nTherefore, using the MaxWorth algorithm, we select the part of the news text\nthat is most valuable for fact-checking and feed it into the BERT network.\nFinally, we pass the outputs of the two BERT networks to an output network to\nclassify the news. The experimental results showed that the proposed model\noutperformed previous models in terms of accuracy and other performance\nmeasures.\n","authors":["Mahmood Farokhian","Vahid Rafe","Hadi Veisi"],"pdf_url":"https://arxiv.org/pdf/2204.04793v2.pdf","comment":"Multimed Tools Appl (2023)"},{"id":"http://arxiv.org/abs/2310.11363v1","updated":"2023-10-17T16:00:26Z","published":"2023-10-17T16:00:26Z","title":"Disentangling the Linguistic Competence of Privacy-Preserving BERT","summary":" Differential Privacy (DP) has been tailored to address the unique challenges\nof text-to-text privatization. However, text-to-text privatization is known for\ndegrading the performance of language models when trained on perturbed text.\nEmploying a series of interpretation techniques on the internal representations\nextracted from BERT trained on perturbed pre-text, we intend to disentangle at\nthe linguistic level the distortion induced by differential privacy.\nExperimental results from a representational similarity analysis indicate that\nthe overall similarity of internal representations is substantially reduced.\nUsing probing tasks to unpack this dissimilarity, we find evidence that\ntext-to-text privatization affects the linguistic competence across several\nformalisms, encoding localized properties of words while falling short at\nencoding the contextual relationships between spans of words.\n","authors":["Stefan Arnold","Nils Kemmerzell","Annika Schreiner"],"pdf_url":"https://arxiv.org/pdf/2310.11363v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11360v1","updated":"2023-10-17T15:55:31Z","published":"2023-10-17T15:55:31Z","title":"Enhancing Neural Machine Translation with Semantic Units","summary":" Conventional neural machine translation (NMT) models typically use subwords\nand words as the basic units for model input and comprehension. However,\ncomplete words and phrases composed of several tokens are often the fundamental\nunits for expressing semantics, referred to as semantic units. 
To address this\nissue, we propose a method Semantic Units for Machine Translation (SU4MT) which\nmodels the integral meanings of semantic units within a sentence, and then\nleverages them to provide a new perspective for understanding the sentence.\nSpecifically, we first propose Word Pair Encoding (WPE), a phrase extraction\nmethod to help identify the boundaries of semantic units. Next, we design an\nAttentive Semantic Fusion (ASF) layer to integrate the semantics of multiple\nsubwords into a single vector: the semantic unit representation. Lastly, the\nsemantic-unit-level sentence representation is concatenated to the token-level\none, and they are combined as the input of encoder. Experimental results\ndemonstrate that our method effectively models and leverages\nsemantic-unit-level information and outperforms the strong baselines. The code\nis available at https://github.com/ictnlp/SU4MT.\n","authors":["Langlin Huang","Shuhao Gu","Zhuocheng Zhang","Yang Feng"],"pdf_url":"https://arxiv.org/pdf/2310.11360v1.pdf","comment":"Accepted to EMNLP findings 2023"},{"id":"http://arxiv.org/abs/2305.12785v2","updated":"2023-10-17T15:48:15Z","published":"2023-05-22T07:30:35Z","title":"MacLaSa: Multi-Aspect Controllable Text Generation via Efficient\n Sampling from Compact Latent Space","summary":" Multi-aspect controllable text generation aims to generate fluent sentences\nthat possess multiple desired attributes simultaneously. Traditional methods\neither combine many operators in the decoding stage, often with costly\niteration or search in the discrete text space, or train separate controllers\nfor each aspect, resulting in a degeneration of text quality due to the\ndiscrepancy between different aspects. To address these limitations, we\nintroduce a novel approach for multi-aspect control, namely MacLaSa, that\nestimates compact latent space for multiple aspects and performs efficient\nsampling with a robust sampler based on ordinary differential equations (ODEs).\nTo eliminate the domain gaps between different aspects, we utilize a\nVariational Autoencoder (VAE) network to map text sequences from varying data\nsources into close latent representations. The estimated latent space enables\nthe formulation of joint energy-based models (EBMs) and the plugging in of\narbitrary attribute discriminators to achieve multi-aspect control. Afterwards,\nwe draw latent vector samples with an ODE-based sampler and feed sampled\nexamples to the VAE decoder to produce target text sequences. Experimental\nresults demonstrate that MacLaSa outperforms several strong baselines on\nattribute relevance and textual quality while maintaining a high inference\nspeed.\n","authors":["Hanxing Ding","Liang Pang","Zihao Wei","Huawei Shen","Xueqi Cheng","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2305.12785v2.pdf","comment":"Accepted to the Findings of EMNLP 2023"},{"id":"http://arxiv.org/abs/2304.12206v2","updated":"2023-10-17T15:46:54Z","published":"2023-04-24T15:46:26Z","title":"PAXQA: Generating Cross-lingual Question Answering Examples at Training\n Scale","summary":" Existing question answering (QA) systems owe much of their success to large,\nhigh-quality training data. Such annotation efforts are costly, and the\ndifficulty compounds in the cross-lingual setting. Therefore, prior\ncross-lingual QA work has focused on releasing evaluation datasets, and then\napplying zero-shot methods as baselines. 
This work proposes a synthetic data\ngeneration method for cross-lingual QA which leverages indirect supervision\nfrom existing parallel corpora. Our method, termed PAXQA (Projecting annotations\nfor cross-lingual (x) QA), decomposes cross-lingual QA into two stages. First,\nwe apply a question generation (QG) model to the English side. Second, we apply\nannotation projection to translate both the questions and answers. To better\ntranslate questions, we propose a novel use of lexically-constrained machine\ntranslation, in which constrained entities are extracted from the parallel\nbitexts.\n We apply PAXQA to generate cross-lingual QA examples in 4 languages (662K\nexamples total), and perform human evaluation on a subset to create validation\nand test splits. We then show that models fine-tuned on these datasets\noutperform prior synthetic data generation models over several extractive QA\ndatasets. The largest performance gains are for directions with non-English\nquestions and English contexts. Ablation studies show that our dataset\ngeneration method is relatively robust to noise from automatic word alignments,\nshowing the sufficient quality of our generations. To facilitate follow-up\nwork, we release our code and datasets at https://github.com/manestay/paxqa .\n","authors":["Bryan Li","Chris Callison-Burch"],"pdf_url":"https://arxiv.org/pdf/2304.12206v2.pdf","comment":"EMNLP 2023 (Findings)"},{"id":"http://arxiv.org/abs/2310.11344v1","updated":"2023-10-17T15:26:40Z","published":"2023-10-17T15:26:40Z","title":"The effect of stemming and lemmatization on Portuguese fake news text\n classification","summary":" With the popularization of the internet, smartphones and social media,\ninformation spreads quickly and easily, which implies a larger flow of\ninformation in the world but also creates a problem that is harming society:\nthe dissemination of fake news. With this larger flow of information, some\npeople try to disseminate deceptive information and fake news. Automatic fake\nnews detection is a challenging task because obtaining good results requires\ndealing with linguistic problems, especially for languages that have not yet\nbeen comprehensively studied; at the same time, some techniques can help to\nreach good results when dealing with text data. The motivation for detecting\nthis deceptive information is that people need to know which information is\ntrue and trustworthy and which is not. In this work, we present the effect that\npre-processing methods such as lemmatization and stemming have on fake news\nclassification; to that end, we designed several classifier models applying\ndifferent pre-processing techniques. The results show that the pre-processing\nstep is important for obtaining better results, and that stemming and\nlemmatization are interesting methods that need to be studied further in order\nto develop techniques focused on the Portuguese language, so we can reach\nbetter results.\n","authors":["Lucca de Freitas Santos","Murilo Varges da Silva"],"pdf_url":"https://arxiv.org/pdf/2310.11344v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10236v3","updated":"2023-10-17T15:20:19Z","published":"2023-07-16T08:28:04Z","title":"Look Before You Leap: An Exploratory Study of Uncertainty Measurement\n for Large Language Models","summary":" The recent performance leap of Large Language Models (LLMs) opens up new\nopportunities across numerous industrial applications and domains. 
However,\nerroneous generations, such as false predictions, misinformation, and\nhallucination made by LLMs, have also raised severe concerns for the\ntrustworthiness of LLMs', especially in safety-, security- and\nreliability-sensitive scenarios, potentially hindering real-world adoptions.\nWhile uncertainty estimation has shown its potential for interpreting the\nprediction risks made by general machine learning (ML) models, little is known\nabout whether and to what extent it can help explore an LLM's capabilities and\ncounteract its undesired behavior. To bridge the gap, in this paper, we\ninitiate an exploratory study on the risk assessment of LLMs from the lens of\nuncertainty. In particular, we experiment with twelve uncertainty estimation\nmethods and four LLMs on four prominent natural language processing (NLP) tasks\nto investigate to what extent uncertainty estimation techniques could help\ncharacterize the prediction risks of LLMs. Our findings validate the\neffectiveness of uncertainty estimation for revealing LLMs'\nuncertain/non-factual predictions. In addition to general NLP tasks, we\nextensively conduct experiments with four LLMs for code generation on two\ndatasets. We find that uncertainty estimation can potentially uncover buggy\nprograms generated by LLMs. Insights from our study shed light on future design\nand development for reliable LLMs, facilitating further research toward\nenhancing the trustworthiness of LLMs.\n","authors":["Yuheng Huang","Jiayang Song","Zhijie Wang","Shengming Zhao","Huaming Chen","Felix Juefei-Xu","Lei Ma"],"pdf_url":"https://arxiv.org/pdf/2307.10236v3.pdf","comment":"20 pages, 4 figures"},{"id":"http://arxiv.org/abs/2310.11324v1","updated":"2023-10-17T15:03:30Z","published":"2023-10-17T15:03:30Z","title":"Quantifying Language Models' Sensitivity to Spurious Features in Prompt\n Design or: How I learned to start worrying about prompt formatting","summary":" As large language models (LLMs) are adopted as a fundamental component of\nlanguage technologies, it is crucial to accurately characterize their\nperformance. Because choices in prompt design can strongly influence model\nbehavior, this design process is critical in effectively using any modern\npre-trained generative language model. In this work, we focus on LLM\nsensitivity to a quintessential class of meaning-preserving design choices:\nprompt formatting. We find that several widely used open-source LLMs are\nextremely sensitive to subtle changes in prompt formatting in few-shot\nsettings, with performance differences of up to 76 accuracy points when\nevaluated using LLaMA-2-13B. Sensitivity remains even when increasing model\nsize, the number of few-shot examples, or performing instruction tuning. Our\nanalysis suggests that work evaluating LLMs with prompting-based methods would\nbenefit from reporting a range of performance across plausible prompt formats,\ninstead of the currently-standard practice of reporting performance on a single\nformat. We also show that format performance only weakly correlates between\nmodels, which puts into question the methodological validity of comparing\nmodels with an arbitrarily chosen, fixed prompt format. 
To facilitate\nsystematic analysis we propose FormatSpread, an algorithm that rapidly\nevaluates a sampled set of plausible prompt formats for a given task, and\nreports the interval of expected performance without accessing model weights.\nFurthermore, we present a suite of analyses that characterize the nature of\nthis sensitivity, including exploring the influence of particular atomic\nperturbations and the internal representation of particular formats.\n","authors":["Melanie Sclar","Yejin Choi","Yulia Tsvetkov","Alane Suhr"],"pdf_url":"https://arxiv.org/pdf/2310.11324v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.09652v2","updated":"2023-10-17T14:59:28Z","published":"2023-05-16T17:53:03Z","title":"The Interpreter Understands Your Meaning: End-to-end Spoken Language\n Understanding Aided by Speech Translation","summary":" End-to-end spoken language understanding (SLU) remains elusive even with\ncurrent large pretrained language models on text and speech, especially in\nmultilingual cases. Machine translation has been established as a powerful\npretraining objective on text as it enables the model to capture high-level\nsemantics of the input utterance and associations between different languages,\nwhich is desired for speech models that work on lower-level acoustic frames.\nMotivated particularly by the task of cross-lingual SLU, we demonstrate that\nthe task of speech translation (ST) is a good means of pretraining speech\nmodels for end-to-end SLU on both intra- and cross-lingual scenarios.\n By introducing ST, our models reach higher performance over baselines on\nmonolingual and multilingual intent classification as well as spoken question\nanswering using SLURP, MINDS-14, and NMSQA benchmarks. To verify the\neffectiveness of our methods, we also create new benchmark datasets from both\nsynthetic and real sources, for speech summarization and low-resource/zero-shot\ntransfer from English to French or Spanish. We further show the value of\npreserving knowledge for the ST pretraining task for better downstream\nperformance, possibly using Bayesian transfer regularizers.\n","authors":["Mutian He","Philip N. Garner"],"pdf_url":"https://arxiv.org/pdf/2305.09652v2.pdf","comment":"16 pages, 3 figures; accepted by Findings of EMNLP 2023"},{"id":"http://arxiv.org/abs/2310.07276v2","updated":"2023-10-17T14:55:58Z","published":"2023-10-11T07:57:08Z","title":"BioT5: Enriching Cross-modal Integration in Biology with Chemical\n Knowledge and Natural Language Associations","summary":" Recent advancements in biological research leverage the integration of\nmolecules, proteins, and natural language to enhance drug discovery. However,\ncurrent models exhibit several limitations, such as the generation of invalid\nmolecular SMILES, underutilization of contextual information, and equal\ntreatment of structured and unstructured knowledge. To address these issues, we\npropose $\\mathbf{BioT5}$, a comprehensive pre-training framework that enriches\ncross-modal integration in biology with chemical knowledge and natural language\nassociations. $\\mathbf{BioT5}$ utilizes SELFIES for $100%$ robust molecular\nrepresentations and extracts knowledge from the surrounding context of\nbio-entities in unstructured biological literature. Furthermore,\n$\\mathbf{BioT5}$ distinguishes between structured and unstructured knowledge,\nleading to more effective utilization of information. 
After fine-tuning, BioT5\nshows superior performance across a wide range of tasks, demonstrating its\nstrong capability of capturing underlying relations and properties of\nbio-entities. Our code is available at\n$\\href{https://github.com/QizhiPei/BioT5}{Github}$.\n","authors":["Qizhi Pei","Wei Zhang","Jinhua Zhu","Kehan Wu","Kaiyuan Gao","Lijun Wu","Yingce Xia","Rui Yan"],"pdf_url":"https://arxiv.org/pdf/2310.07276v2.pdf","comment":"Accepted by Empirical Methods in Natural Language Processing 2023\n (EMNLP 2023)"},{"id":"http://arxiv.org/abs/2310.11318v1","updated":"2023-10-17T14:52:33Z","published":"2023-10-17T14:52:33Z","title":"Utilising a Large Language Model to Annotate Subject Metadata: A Case\n Study in an Australian National Research Data Catalogue","summary":" In support of open and reproducible research, there has been a rapidly\nincreasing number of datasets made available for research. As the availability\nof datasets increases, it becomes more important to have quality metadata for\ndiscovering and reusing them. Yet, it is a common issue that datasets often\nlack quality metadata due to limited resources for data curation. Meanwhile,\ntechnologies such as artificial intelligence and large language models (LLMs)\nare progressing rapidly. Recently, systems based on these technologies, such as\nChatGPT, have demonstrated promising capabilities for certain data curation\ntasks. This paper proposes to leverage LLMs for cost-effective annotation of\nsubject metadata through the LLM-based in-context learning. Our method employs\nGPT-3.5 with prompts designed for annotating subject metadata, demonstrating\npromising performance in automatic metadata annotation. However, models based\non in-context learning cannot acquire discipline-specific rules, resulting in\nlower performance in several categories. This limitation arises from the\nlimited contextual information available for subject inference. To the best of\nour knowledge, we are introducing, for the first time, an in-context learning\nmethod that harnesses large language models for automated subject metadata\nannotation.\n","authors":["Shiwei Zhang","Mingfang Wu","Xiuzhen Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.11318v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09520v2","updated":"2023-10-17T14:48:25Z","published":"2023-10-14T07:19:47Z","title":"Reward-Augmented Decoding: Efficient Controlled Text Generation With a\n Unidirectional Reward Model","summary":" While large language models have proven effective in a huge range of\ndownstream applications, they often generate text that is problematic or lacks\na desired attribute. In this paper, we introduce Reward-Augmented Decoding\n(RAD), a text generation procedure that uses a small unidirectional reward\nmodel to encourage a language model to generate text that has certain\nproperties. Specifically, RAD uses the reward model to score generations as\nthey are produced and rescales sampling probabilities to favor high-reward\ntokens. By using a unidirectional reward model, RAD can cache activations from\nprior generation steps to decrease computational overhead. Through experiments\non generating non-toxic and sentiment-controlled text, we demonstrate that RAD\nperforms best among methods that change only the generation procedure and\nmatches the performance of state-of-the-art methods that involve re-training\nthe language model. 
We further validate that RAD is effective on very large\nlanguage models while incurring a minimal computational overhead.\n","authors":["Haikang Deng","Colin Raffel"],"pdf_url":"https://arxiv.org/pdf/2310.09520v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.11171v2","updated":"2023-10-17T14:45:39Z","published":"2023-05-18T17:58:35Z","title":"TrueTeacher: Learning Factual Consistency Evaluation with Large Language\n Models","summary":" Factual consistency evaluation is often conducted using Natural Language\nInference (NLI) models, yet these models exhibit limited success in evaluating\nsummaries. Previous work improved such models with synthetic training data.\nHowever, the data is typically based on perturbed human-written summaries,\nwhich often differ in their characteristics from real model-generated summaries\nand have limited coverage of possible factual errors. Alternatively, large\nlanguage models (LLMs) have recently shown promising results in directly\nevaluating generative tasks, but are too computationally expensive for\npractical use. Motivated by these limitations, we introduce TrueTeacher, a\nmethod for generating synthetic data by annotating diverse model-generated\nsummaries using a LLM. Unlike prior work, TrueTeacher does not rely on\nhuman-written summaries, and is multilingual by nature. Experiments on the TRUE\nbenchmark show that a student model trained using our data, substantially\noutperforms both the state-of-the-art model with similar capacity, and the LLM\nteacher. In a systematic study, we compare TrueTeacher to existing synthetic\ndata generation methods and demonstrate its superiority and robustness to\ndomain-shift. We also show that our method generalizes to multilingual\nscenarios. Lastly, we release our large scale synthetic dataset (1.4M\nexamples), generated using TrueTeacher, and a checkpoint trained on this data.\n","authors":["Zorik Gekhman","Jonathan Herzig","Roee Aharoni","Chen Elkind","Idan Szpektor"],"pdf_url":"https://arxiv.org/pdf/2305.11171v2.pdf","comment":"Accepted as a long paper in EMNLP 2023"},{"id":"http://arxiv.org/abs/2305.15040v2","updated":"2023-10-17T14:37:10Z","published":"2023-05-24T11:27:53Z","title":"Active Learning for Natural Language Generation","summary":" The field of Natural Language Generation (NLG) suffers from a severe shortage\nof labeled data due to the extremely expensive and time-consuming process\ninvolved in manual annotation. A natural approach for coping with this problem\nis active learning (AL), a well-known machine learning technique for improving\nannotation efficiency by selectively choosing the most informative examples to\nlabel. However, while AL has been well-researched in the context of text\nclassification, its application to NLG remains largely unexplored. In this\npaper, we present a first systematic study of active learning for NLG,\nconsidering a diverse set of tasks and multiple leading selection strategies,\nand harnessing a strong instruction-tuned model. Our results indicate that the\nperformance of existing AL strategies is inconsistent, surpassing the baseline\nof random example selection in some cases but not in others. We highlight some\nnotable differences between the classification and generation scenarios, and\nanalyze the selection behaviors of existing AL strategies. 
Our findings\nmotivate exploring novel approaches for applying AL to generation tasks.\n","authors":["Yotam Perlitz","Ariel Gera","Michal Shmueli-Scheuer","Dafna Sheinwald","Noam Slonim","Liat Ein-Dor"],"pdf_url":"https://arxiv.org/pdf/2305.15040v2.pdf","comment":"Accepted to EMNLP2023 as a long paper"},{"id":"http://arxiv.org/abs/2310.11303v1","updated":"2023-10-17T14:27:34Z","published":"2023-10-17T14:27:34Z","title":"QADYNAMICS: Training Dynamics-Driven Synthetic QA Diagnostic for\n Zero-Shot Commonsense Question Answering","summary":" Zero-shot commonsense Question-Answering (QA) requires models to reason about\ngeneral situations beyond specific benchmarks. State-of-the-art approaches\nfine-tune language models on QA pairs constructed from CommonSense Knowledge\nBases (CSKBs) to equip the models with more commonsense knowledge in a QA\ncontext. However, current QA synthesis protocols may introduce noise from the\nCSKBs and generate ungrammatical questions and false negative options, which\nimpede the model's ability to generalize. To address these issues, we propose\nQADYNAMICS, a training dynamics-driven framework for QA diagnostics and\nrefinement. Our approach analyzes the training dynamics of each QA pair at both\nthe question level and option level, discarding machine-detectable artifacts by\nremoving uninformative QA pairs and mislabeled or false-negative options.\nExtensive experiments demonstrate the effectiveness of our approach, which\noutperforms all baselines while using only 33% of the synthetic data, even\nincluding LLMs such as ChatGPT. Moreover, expert evaluations confirm that our\nframework significantly improves the quality of QA synthesis. Our codes and\nmodel checkpoints are available at\nhttps://github.com/HKUST-KnowComp/QaDynamics.\n","authors":["Haochen Shi","Weiqi Wang","Tianqing Fang","Baixuan Xu","Wenxuan Ding","Xin Liu","Yangqiu Song"],"pdf_url":"https://arxiv.org/pdf/2310.11303v1.pdf","comment":"Findings of EMNLP2023"},{"id":"http://arxiv.org/abs/2302.11713v5","updated":"2023-10-17T14:19:13Z","published":"2023-02-23T00:33:54Z","title":"Can Pre-trained Vision and Language Models Answer Visual\n Information-Seeking Questions?","summary":" Pre-trained vision and language models have demonstrated state-of-the-art\ncapabilities over existing tasks involving images and texts, including visual\nquestion answering. However, it remains unclear whether these models possess\nthe capability to answer questions that are not only querying visual content\nbut knowledge-intensive and information-seeking. In this study, we introduce\nInfoSeek, a visual question answering dataset tailored for information-seeking\nquestions that cannot be answered with only common sense knowledge. Using\nInfoSeek, we analyze various pre-trained visual question answering models and\ngain insights into their characteristics. Our findings reveal that\nstate-of-the-art pre-trained multi-modal models (e.g., PaLI-X, BLIP2, etc.)\nface challenges in answering visual information-seeking questions, but\nfine-tuning on the InfoSeek dataset elicits models to use fine-grained\nknowledge that was learned during their pre-training. 
Furthermore, we show that\naccurate visual entity recognition can be used to improve performance on\nInfoSeek by retrieving relevant documents, showing a significant space for\nimprovement.\n","authors":["Yang Chen","Hexiang Hu","Yi Luan","Haitian Sun","Soravit Changpinyo","Alan Ritter","Ming-Wei Chang"],"pdf_url":"https://arxiv.org/pdf/2302.11713v5.pdf","comment":"EMNLP 2023 (main conference); Our dataset and evaluation is available\n at https://open-vision-language.github.io/infoseek/"},{"id":"http://arxiv.org/abs/2309.12871v3","updated":"2023-10-17T14:08:53Z","published":"2023-09-22T13:52:42Z","title":"AnglE-optimized Text Embeddings","summary":" High-quality text embedding is pivotal in improving semantic textual\nsimilarity (STS) tasks, which are crucial components in Large Language Model\n(LLM) applications. However, a common challenge existing text embedding models\nface is the problem of vanishing gradients, primarily due to their reliance on\nthe cosine function in the optimization objective, which has saturation zones.\nTo address this issue, this paper proposes a novel angle-optimized text\nembedding model called AnglE. The core idea of AnglE is to introduce angle\noptimization in a complex space. This novel approach effectively mitigates the\nadverse effects of the saturation zone in the cosine function, which can impede\ngradient and hinder optimization processes. To set up a comprehensive STS\nevaluation, we experimented on existing short-text STS datasets and a newly\ncollected long-text STS dataset from GitHub Issues. Furthermore, we examine\ndomain-specific STS scenarios with limited labeled data and explore how AnglE\nworks with LLM-annotated data. Extensive experiments were conducted on various\ntasks including short-text STS, long-text STS, and domain-specific STS tasks.\nThe results show that AnglE outperforms the state-of-the-art (SOTA) STS models\nthat ignore the cosine saturation zone. These findings demonstrate the ability\nof AnglE to generate high-quality text embeddings and the usefulness of angle\noptimization in STS.\n","authors":["Xianming Li","Jing Li"],"pdf_url":"https://arxiv.org/pdf/2309.12871v3.pdf","comment":"update llama results"},{"id":"http://arxiv.org/abs/2310.11282v1","updated":"2023-10-17T14:06:06Z","published":"2023-10-17T14:06:06Z","title":"ChapGTP, ILLC's Attempt at Raising a BabyLM: Improving Data Efficiency\n by Automatic Task Formation","summary":" We present the submission of the ILLC at the University of Amsterdam to the\nBabyLM challenge (Warstadt et al., 2023), in the strict-small track. Our final\nmodel, ChapGTP, is a masked language model that was trained for 200 epochs,\naided by a novel data augmentation technique called Automatic Task Formation.\nWe discuss in detail the performance of this model on the three evaluation\nsuites: BLiMP, (Super)GLUE, and MSGS. 
Furthermore, we present a wide range of\nmethods that were ultimately not included in the model, but may serve as\ninspiration for training LMs in low-resource settings.\n","authors":["Jaap Jumelet","Michael Hanna","Marianne de Heer Kloots","Anna Langedijk","Charlotte Pouw","Oskar van der Wal"],"pdf_url":"https://arxiv.org/pdf/2310.11282v1.pdf","comment":"Part of the BabyLM challenge at CoNLL"},{"id":"http://arxiv.org/abs/2310.11275v1","updated":"2023-10-17T13:53:57Z","published":"2023-10-17T13:53:57Z","title":"xMEN: A Modular Toolkit for Cross-Lingual Medical Entity Normalization","summary":" Objective: To improve performance of medical entity normalization across many\nlanguages, especially when fewer language resources are available compared to\nEnglish.\n Materials and Methods: We introduce xMEN, a modular system for cross-lingual\nmedical entity normalization, which performs well in both low- and\nhigh-resource scenarios. When synonyms in the target language are scarce for a\ngiven terminology, we leverage English aliases via cross-lingual candidate\ngeneration. For candidate ranking, we incorporate a trainable cross-encoder\nmodel if annotations for the target task are available. We also evaluate\ncross-encoders trained in a weakly supervised manner based on\nmachine-translated datasets from a high resource domain. Our system is publicly\navailable as an extensible Python toolkit.\n Results: xMEN improves the state-of-the-art performance across a wide range\nof multilingual benchmark datasets. Weakly supervised cross-encoders are\neffective when no training data is available for the target task. Through the\ncompatibility of xMEN with the BigBIO framework, it can be easily used with\nexisting and prospective datasets.\n Discussion: Our experiments show the importance of balancing the output of\ngeneral-purpose candidate generators with subsequent trainable re-rankers,\nwhich we achieve through a rank regularization term in the loss function of the\ncross-encoder. However, error analysis reveals that multi-word expressions and\nother complex entities are still challenging.\n Conclusion: xMEN exhibits strong performance for medical entity normalization\nin multiple languages, even when no labeled data and few terminology aliases\nfor the target language are available. Its configuration system and evaluation\nmodules enable reproducible benchmarks. Models and code are available online at\nthe following URL: https://github.com/hpi-dhc/xmen\n","authors":["Florian Borchert","Ignacio Llorca","Roland Roller","Bert Arnrich","Matthieu-P. Schapranow"],"pdf_url":"https://arxiv.org/pdf/2310.11275v1.pdf","comment":"16 pages, 3 figures"},{"id":"http://arxiv.org/abs/2310.11266v1","updated":"2023-10-17T13:39:26Z","published":"2023-10-17T13:39:26Z","title":"Emulating Human Cognitive Processes for Expert-Level Medical\n Question-Answering with Large Language Models","summary":" In response to the pressing need for advanced clinical problem-solving tools\nin healthcare, we introduce BooksMed, a novel framework based on a Large\nLanguage Model (LLM). BooksMed uniquely emulates human cognitive processes to\ndeliver evidence-based and reliable responses, utilizing the GRADE (Grading of\nRecommendations, Assessment, Development, and Evaluations) framework to\neffectively quantify evidence strength. For clinical decision-making to be\nappropriately assessed, an evaluation metric that is clinically aligned and\nvalidated is required. 
As a solution, we present ExpertMedQA, a multispecialty\nclinical benchmark comprised of open-ended, expert-level clinical questions,\nand validated by a diverse group of medical professionals. By demanding an\nin-depth understanding and critical appraisal of up-to-date clinical\nliterature, ExpertMedQA rigorously evaluates LLM performance. BooksMed\noutperforms existing state-of-the-art models Med-PaLM 2, Almanac, and ChatGPT\nin a variety of medical scenarios. Therefore, a framework that mimics human\ncognitive stages could be a useful tool for providing reliable and\nevidence-based responses to clinical inquiries.\n","authors":["Khushboo Verma","Marina Moore","Stephanie Wottrich","Karla Robles López","Nishant Aggarwal","Zeel Bhatt","Aagamjit Singh","Bradford Unroe","Salah Basheer","Nitish Sachdeva","Prinka Arora","Harmanjeet Kaur","Tanupreet Kaur","Tevon Hood","Anahi Marquez","Tushar Varshney","Nanfu Deng","Azaan Ramani","Pawanraj Ishwara","Maimoona Saeed","Tatiana López Velarde Peña","Bryan Barksdale","Sushovan Guha","Satwant Kumar"],"pdf_url":"https://arxiv.org/pdf/2310.11266v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.14107v2","updated":"2023-10-17T13:38:27Z","published":"2023-09-25T13:00:33Z","title":"Wav2vec-based Detection and Severity Level Classification of Dysarthria\n from Speech","summary":" Automatic detection and severity level classification of dysarthria directly\nfrom acoustic speech signals can be used as a tool in medical diagnosis. In\nthis work, the pre-trained wav2vec 2.0 model is studied as a feature extractor\nto build detection and severity level classification systems for dysarthric\nspeech. The experiments were carried out with the popularly used UA-speech\ndatabase. In the detection experiments, the results revealed that the best\nperformance was obtained using the embeddings from the first layer of the\nwav2vec model that yielded an absolute improvement of 1.23% in accuracy\ncompared to the best performing baseline feature (spectrogram). In the studied\nseverity level classification task, the results revealed that the embeddings\nfrom the final layer gave an absolute improvement of 10.62% in accuracy\ncompared to the best baseline features (mel-frequency cepstral coefficients).\n","authors":["Farhad Javanmardi","Saska Tirronen","Manila Kodali","Sudarsana Reddy Kadiri","Paavo Alku"],"pdf_url":"https://arxiv.org/pdf/2309.14107v2.pdf","comment":"copyright 2023 IEEE. Personal use of this material is permitted.\n Permission from IEEE must be obtained for all other uses, in any current or\n future media, including reprinting/republishing this material for advertising\n or promotional purposes, creating new collective works, for resale or\n redistribution to servers or lists, or reuse of any copyrighted component of\n this work in other works"},{"id":"http://arxiv.org/abs/2309.14080v2","updated":"2023-10-17T13:36:36Z","published":"2023-09-25T12:14:25Z","title":"Analysis and Detection of Pathological Voice using Glottal Source\n Features","summary":" Automatic detection of voice pathology enables objective assessment and\nearlier intervention for the diagnosis. This study provides a systematic\nanalysis of glottal source features and investigates their effectiveness in\nvoice pathology detection. 
Glottal source features are extracted using glottal\nflows estimated with the quasi-closed phase (QCP) glottal inverse filtering\nmethod, using approximate glottal source signals computed with the zero\nfrequency filtering (ZFF) method, and using acoustic voice signals directly. In\naddition, we propose to derive mel-frequency cepstral coefficients (MFCCs) from\nthe glottal source waveforms computed by QCP and ZFF to effectively capture the\nvariations in glottal source spectra of pathological voice. Experiments were\ncarried out using two databases, the Hospital Universitario Principe de\nAsturias (HUPA) database and the Saarbrucken Voice Disorders (SVD) database.\nAnalysis of features revealed that the glottal source contains information that\ndiscriminates normal and pathological voice. Pathology detection experiments\nwere carried out using support vector machine (SVM). From the detection\nexperiments it was observed that the performance achieved with the studied\nglottal source features is comparable or better than that of conventional MFCCs\nand perceptual linear prediction (PLP) features. The best detection performance\nwas achieved when the glottal source features were combined with the\nconventional MFCCs and PLP features, which indicates the complementary nature\nof the features.\n","authors":["Sudarsana Reddy Kadiri","Paavo Alku"],"pdf_url":"https://arxiv.org/pdf/2309.14080v2.pdf","comment":"Copyright 2020 IEEE. Personal use of this material is permitted.\n Permission from IEEE must be obtained for all other uses, in any current or\n future media, including reprinting/republishing this material for advertising\n or promotional purposes, creating new collective works, for resale or\n redistribution to servers or lists, or reuse of any copyrighted component of\n this work in other works"},{"id":"http://arxiv.org/abs/2305.07609v3","updated":"2023-10-17T13:29:54Z","published":"2023-05-12T16:54:36Z","title":"Is ChatGPT Fair for Recommendation? Evaluating Fairness in Large\n Language Model Recommendation","summary":" The remarkable achievements of Large Language Models (LLMs) have led to the\nemergence of a novel recommendation paradigm -- Recommendation via LLM\n(RecLLM). Nevertheless, it is important to note that LLMs may contain social\nprejudices, and therefore, the fairness of recommendations made by RecLLM\nrequires further investigation. To avoid the potential risks of RecLLM, it is\nimperative to evaluate the fairness of RecLLM with respect to various sensitive\nattributes on the user side. Due to the differences between the RecLLM paradigm\nand the traditional recommendation paradigm, it is problematic to directly use\nthe fairness benchmark of traditional recommendation. To address the dilemma,\nwe propose a novel benchmark called Fairness of Recommendation via LLM\n(FaiRLLM). This benchmark comprises carefully crafted metrics and a dataset\nthat accounts for eight sensitive attributes1 in two recommendation scenarios:\nmusic and movies. By utilizing our FaiRLLM benchmark, we conducted an\nevaluation of ChatGPT and discovered that it still exhibits unfairness to some\nsensitive attributes when generating recommendations. 
Our code and dataset can\nbe found at https://github.com/jizhi-zhang/FaiRLLM.\n","authors":["Jizhi Zhang","Keqin Bao","Yang Zhang","Wenjie Wang","Fuli Feng","Xiangnan He"],"pdf_url":"https://arxiv.org/pdf/2305.07609v3.pdf","comment":"Accepted by Recsys 2023 (Short)"},{"id":"http://arxiv.org/abs/2310.11258v1","updated":"2023-10-17T13:23:18Z","published":"2023-10-17T13:23:18Z","title":"Utilizing Weak Supervision To Generate Indonesian Conservation Dataset","summary":" Weak supervision has emerged as a promising approach for rapid and\nlarge-scale dataset creation in response to the increasing demand for\naccelerated NLP development. By leveraging labeling functions, weak supervision\nallows practitioners to generate datasets quickly by creating learned label\nmodels that produce soft-labeled datasets. This paper aims to show how such an\napproach can be utilized to build an Indonesian NLP dataset from conservation\nnews text. We construct two types of datasets: multi-class classification and\nsentiment classification. We then provide baseline experiments using various\npretrained language models. These baseline results demonstrate test\nperformances of 59.79% accuracy and 55.72% F1-score for sentiment\nclassification, 66.87% F1-score-macro, 71.5% F1-score-micro, and 83.67% ROC-AUC\nfor multi-class classification. Additionally, we release the datasets and\nlabeling functions used in this work for further research and exploration.\n","authors":["Mega Fransiska","Diah Pitaloka"," Saripudin","Satrio Putra","Lintang Sutawika"],"pdf_url":"https://arxiv.org/pdf/2310.11258v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11252v1","updated":"2023-10-17T13:20:16Z","published":"2023-10-17T13:20:16Z","title":"Revealing the Unwritten: Visual Investigation of Beam Search Trees to\n Address Language Model Prompting Challenges","summary":" The growing popularity of generative language models has amplified interest\nin interactive methods to guide model outputs. Prompt refinement is considered\none of the most effective means to influence output among these methods. We\nidentify several challenges associated with prompting large language models,\ncategorized into data- and model-specific, linguistic, and socio-linguistic\nchallenges. A comprehensive examination of model outputs, including runner-up\ncandidates and their corresponding probabilities, is needed to address these\nissues. The beam search tree, the prevalent algorithm to sample model outputs,\ncan inherently supply this information. Consequently, we introduce an\ninteractive visual method for investigating the beam search tree, facilitating\nanalysis of the decisions made by the model during generation. We\nquantitatively show the value of exposing the beam search tree and present five\ndetailed analysis scenarios addressing the identified challenges. Our\nmethodology validates existing results and offers additional insights.\n","authors":["Thilo Spinner","Rebecca Kehlbeck","Rita Sevastjanova","Tobias Stähle","Daniel A. 
Keim","Oliver Deussen","Andreas Spitz","Mennatallah El-Assady"],"pdf_url":"https://arxiv.org/pdf/2310.11252v1.pdf","comment":"9 pages paper, 2 pages references, 7 figures"},{"id":"http://arxiv.org/abs/2310.11248v1","updated":"2023-10-17T13:18:01Z","published":"2023-10-17T13:18:01Z","title":"CrossCodeEval: A Diverse and Multilingual Benchmark for Cross-File Code\n Completion","summary":" Code completion models have made significant progress in recent years, yet\ncurrent popular evaluation datasets, such as HumanEval and MBPP, predominantly\nfocus on code completion tasks within a single file. This over-simplified\nsetting falls short of representing the real-world software development\nscenario where repositories span multiple files with numerous cross-file\ndependencies, and accessing and understanding cross-file context is often\nrequired to complete the code correctly.\n To fill in this gap, we propose CrossCodeEval, a diverse and multilingual\ncode completion benchmark that necessitates an in-depth cross-file contextual\nunderstanding to complete the code accurately. CrossCodeEval is built on a\ndiverse set of real-world, open-sourced, permissively-licensed repositories in\nfour popular programming languages: Python, Java, TypeScript, and C#. To create\nexamples that strictly require cross-file context for accurate completion, we\npropose a straightforward yet efficient static-analysis-based approach to\npinpoint the use of cross-file context within the current file.\n Extensive experiments on state-of-the-art code language models like CodeGen\nand StarCoder demonstrate that CrossCodeEval is extremely challenging when the\nrelevant cross-file context is absent, and we see clear improvements when\nadding these context into the prompt. However, despite such improvements, the\npinnacle of performance remains notably unattained even with the\nhighest-performing model, indicating that CrossCodeEval is also capable of\nassessing model's capability in leveraging extensive context to make better\ncode completion. Finally, we benchmarked various methods in retrieving\ncross-file context, and show that CrossCodeEval can also be used to measure the\ncapability of code retrievers.\n","authors":["Yangruibo Ding","Zijian Wang","Wasi Uddin Ahmad","Hantian Ding","Ming Tan","Nihal Jain","Murali Krishna Ramanathan","Ramesh Nallapati","Parminder Bhatia","Dan Roth","Bing Xiang"],"pdf_url":"https://arxiv.org/pdf/2310.11248v1.pdf","comment":"To appear at NeurIPS 2023 (Datasets and Benchmarks Track)"},{"id":"http://arxiv.org/abs/2310.11244v1","updated":"2023-10-17T13:12:32Z","published":"2023-10-17T13:12:32Z","title":"Entity Matching using Large Language Models","summary":" Entity Matching is the task of deciding whether two entity descriptions refer\nto the same real-world entity. Entity Matching is a central step in most data\nintegration pipelines and an enabler for many e-commerce applications which\nrequire to match products offers from different vendors. State-of-the-art\nentity matching methods often rely on pre-trained language models (PLMs) such\nas BERT or RoBERTa. Two major drawbacks of these models for entity matching are\nthat (i) the models require significant amounts of task-specific training data\nand (ii) the fine-tuned models are not robust concerning out-of-distribution\nentities. In this paper, we investigate using large language models (LLMs) for\nentity matching as a less domain-specific training data reliant and more robust\nalternative to PLM-based matchers. 
Our study covers hosted LLMs, such as GPT3.5\nand GPT4, as well as open source LLMs based on Llama2, which can be run locally.\nWe evaluate these models in a zero-shot scenario as well as a scenario where\ntask-specific training data is available. We compare different prompt designs\nas well as the prompt sensitivity of the models in the zero-shot scenario. We\ninvestigate (i) the selection of in-context demonstrations, (ii) the generation\nof matching rules, as well as (iii) fine-tuning GPT3.5 in the second scenario\nusing the same pool of training data across the different approaches. Our\nexperiments show that GPT4 without any task-specific training data outperforms\nfine-tuned PLMs (RoBERTa and Ditto) on three out of five benchmark datasets,\nreaching F1 scores around 90%. The experiments with in-context learning and\nrule generation show that all models besides GPT4 benefit from these\ntechniques (on average 5.9% and 2.2% F1), while GPT4 does not need such\nadditional guidance in most cases...\n","authors":["Ralph Peeters","Christian Bizer"],"pdf_url":"https://arxiv.org/pdf/2310.11244v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.12053v4","updated":"2023-10-17T13:10:05Z","published":"2023-09-21T13:20:13Z","title":"AceGPT, Localizing Large Language Models in Arabic","summary":" This paper is devoted to the development of a localized Large Language Model\n(LLM) specifically for Arabic, a language imbued with unique cultural\ncharacteristics inadequately addressed by current mainstream models.\nSignificant concerns emerge when addressing cultural sensitivity and local\nvalues. To address this, the paper proposes a comprehensive solution that\nincludes further pre-training with Arabic texts, Supervised Fine-Tuning (SFT)\nutilizing native Arabic instructions, and GPT-4 responses in Arabic, alongside\nReinforcement Learning with AI Feedback (RLAIF) employing a reward model\nattuned to local culture and values. The goal is to cultivate culturally\ncognizant and value-aligned Arabic LLMs capable of accommodating the diverse,\napplication-specific needs of Arabic-speaking communities. Comprehensive\nevaluations reveal that the resulting model, dubbed 'AceGPT', sets the\nstate-of-the-art standard for open Arabic LLMs across various benchmarks,\nincluding the instruction-following benchmark (i.e., Arabic Vicuna-80 and\nArabic AlpacaEval), knowledge benchmark (i.e., Arabic MMLU and EXAMs), and the\nnewly introduced Arabic Cultural and Value Alignment benchmark. Notably, AceGPT\noutperforms Turbo in the popular Vicuna-80 benchmark when evaluated with GPT-4,\ndespite the benchmark's limited scale. Codes, data, and models are in\nhttps://github.com/FreedomIntelligence/AceGPT.\n","authors":["Huang Huang","Fei Yu","Jianqing Zhu","Xuening Sun","Hao Cheng","Dingjie Song","Zhihong Chen","Abdulmohsen Alharthi","Bang An","Juncai He","Ziche Liu","Zhiyi Zhang","Junying Chen","Jianquan Li","Benyou Wang","Lian Zhang","Ruoyu Sun","Xiang Wan","Haizhou Li","Jinchao Xu"],"pdf_url":"https://arxiv.org/pdf/2309.12053v4.pdf","comment":"https://github.com/FreedomIntelligence/AceGPT"},{"id":"http://arxiv.org/abs/2310.11237v1","updated":"2023-10-17T13:06:59Z","published":"2023-10-17T13:06:59Z","title":"Watermarking LLMs with Weight Quantization","summary":" Abuse of large language models reveals high risks as large language models\nare being deployed at an astonishing speed. It is important to protect the\nmodel weights to avoid malicious usage that violates licenses of open-source\nlarge language models. 
This paper proposes a novel watermarking strategy that\nplants watermarks in the quantization process of large language models without\npre-defined triggers during inference. The watermark works when the model is\nused in the fp32 mode and remains hidden when the model is quantized to int8,\nin this way, the users can only inference the model without further supervised\nfine-tuning of the model. We successfully plant the watermark into open-source\nlarge language model weights including GPT-Neo and LLaMA. We hope our proposed\nmethod can provide a potential direction for protecting model weights in the\nera of large language model applications.\n","authors":["Linyang Li","Botian Jiang","Pengyu Wang","Ke Ren","Hang Yan","Xipeng Qiu"],"pdf_url":"https://arxiv.org/pdf/2310.11237v1.pdf","comment":"Accepted by Findings of EMNLP2023"},{"id":"http://arxiv.org/abs/2310.11227v1","updated":"2023-10-17T12:58:17Z","published":"2023-10-17T12:58:17Z","title":"RealBehavior: A Framework for Faithfully Characterizing Foundation\n Models' Human-like Behavior Mechanisms","summary":" Reports of human-like behaviors in foundation models are growing, with\npsychological theories providing enduring tools to investigate these behaviors.\nHowever, current research tends to directly apply these human-oriented tools\nwithout verifying the faithfulness of their outcomes. In this paper, we\nintroduce a framework, RealBehavior, which is designed to characterize the\nhumanoid behaviors of models faithfully. Beyond simply measuring behaviors, our\nframework assesses the faithfulness of results based on reproducibility,\ninternal and external consistency, and generalizability. Our findings suggest\nthat a simple application of psychological tools cannot faithfully characterize\nall human-like behaviors. Moreover, we discuss the impacts of aligning models\nwith human and social values, arguing for the necessity of diversifying\nalignment objectives to prevent the creation of models with restricted\ncharacteristics.\n","authors":["Enyu Zhou","Rui Zheng","Zhiheng Xi","Songyang Gao","Xiaoran Fan","Zichu Fei","Jingting Ye","Tao Gui","Qi Zhang","Xuanjing Huang"],"pdf_url":"https://arxiv.org/pdf/2310.11227v1.pdf","comment":"Accepted to Findings of EMNLP 2023"},{"id":"http://arxiv.org/abs/2310.08943v2","updated":"2023-10-17T12:53:58Z","published":"2023-10-13T08:16:27Z","title":"Multi-level Adaptive Contrastive Learning for Knowledge Internalization\n in Dialogue Generation","summary":" Knowledge-grounded dialogue generation aims to mitigate the issue of text\ndegeneration by incorporating external knowledge to supplement the context.\nHowever, the model often fails to internalize this information into responses\nin a human-like manner. Instead, it simply inserts segments of the provided\nknowledge into generic responses. As a result, the generated responses tend to\nbe tedious, incoherent, and in lack of interactivity which means the\ndegeneration problem is still unsolved. In this work, we first find that such\ncopying-style degeneration is primarily due to the weak likelihood objective,\nwhich allows the model to \"cheat\" the objective by merely duplicating knowledge\nsegments in a superficial pattern matching based on overlap. To overcome this\nchallenge, we then propose a Multi-level Adaptive Contrastive Learning (MACL)\nframework that dynamically samples negative examples and subsequently penalizes\ndegeneration behaviors at both the token-level and sequence-level. 
Extensive\nexperiments on the WoW dataset demonstrate the effectiveness of our approach\nacross various pre-trained models.\n","authors":["Chenxu Yang","Zheng Lin","Lanrui Wang","Chong Tian","Liang Pang","Jiangnan Li","Qirong Ho","Yanan Cao","Weiping Wang"],"pdf_url":"https://arxiv.org/pdf/2310.08943v2.pdf","comment":"Accepted by EMNLP 2023"},{"id":"http://arxiv.org/abs/2310.11220v1","updated":"2023-10-17T12:51:35Z","published":"2023-10-17T12:51:35Z","title":"KG-GPT: A General Framework for Reasoning on Knowledge Graphs Using\n Large Language Models","summary":" While large language models (LLMs) have made considerable advancements in\nunderstanding and generating unstructured text, their application in structured\ndata remains underexplored. Particularly, using LLMs for complex reasoning\ntasks on knowledge graphs (KGs) remains largely untouched. To address this, we\npropose KG-GPT, a multi-purpose framework leveraging LLMs for tasks employing\nKGs. KG-GPT comprises three steps: Sentence Segmentation, Graph Retrieval, and\nInference, each aimed at partitioning sentences, retrieving relevant graph\ncomponents, and deriving logical conclusions, respectively. We evaluate KG-GPT\nusing KG-based fact verification and KGQA benchmarks, with the model showing\ncompetitive and robust performance, even outperforming several fully-supervised\nmodels. Our work, therefore, marks a significant step in unifying structured\nand unstructured data processing within the realm of LLMs.\n","authors":["Jiho Kim","Yeonsu Kwon","Yohan Jo","Edward Choi"],"pdf_url":"https://arxiv.org/pdf/2310.11220v1.pdf","comment":"Accepted to EMNLP 2023 Findings"},{"id":"http://arxiv.org/abs/2310.11207v1","updated":"2023-10-17T12:34:32Z","published":"2023-10-17T12:34:32Z","title":"Can Large Language Models Explain Themselves? A Study of LLM-Generated\n Self-Explanations","summary":" Large language models (LLMs) such as ChatGPT have demonstrated superior\nperformance on a variety of natural language processing (NLP) tasks including\nsentiment analysis, mathematical reasoning and summarization. Furthermore,\nsince these models are instruction-tuned on human conversations to produce\n\"helpful\" responses, they can and often will produce explanations along with\nthe response, which we call self-explanations. For example, when analyzing the\nsentiment of a movie review, the model may output not only the positivity of\nthe sentiment, but also an explanation (e.g., by listing the sentiment-laden\nwords such as \"fantastic\" and \"memorable\" in the review). How good are these\nautomatically generated self-explanations? In this paper, we investigate this\nquestion on the task of sentiment analysis and for feature attribution\nexplanation, one of the most commonly studied settings in the interpretability\nliterature (for pre-ChatGPT models). Specifically, we study different ways to\nelicit the self-explanations, evaluate their faithfulness on a set of\nevaluation metrics, and compare them to traditional explanation methods such as\nocclusion or LIME saliency maps. Through an extensive set of experiments, we\nfind that ChatGPT's self-explanations perform on par with traditional ones, but\nare quite different from them according to various agreement metrics, meanwhile\nbeing much cheaper to produce (as they are generated along with the\nprediction). 
In addition, we identified several interesting characteristics of\nthem, which prompt us to rethink many current model interpretability practices\nin the era of ChatGPT(-like) LLMs.\n","authors":["Shiyuan Huang","Siddarth Mamidanna","Shreedhar Jangam","Yilun Zhou","Leilani H. Gilpin"],"pdf_url":"https://arxiv.org/pdf/2310.11207v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15452v3","updated":"2023-10-17T12:17:41Z","published":"2023-08-29T17:22:39Z","title":"When Do Program-of-Thoughts Work for Reasoning?","summary":" The reasoning capabilities of Large Language Models (LLMs) play a pivotal\nrole in the realm of embodied artificial intelligence. Although there are\neffective methods like program-of-thought prompting for LLMs which uses\nprogramming language to tackle complex reasoning tasks, the specific impact of\ncode data on the improvement of reasoning capabilities remains under-explored.\nTo address this gap, we propose complexity-impacted reasoning score (CIRS),\nwhich combines structural and logical attributes, to measure the correlation\nbetween code and reasoning abilities. Specifically, we use the abstract syntax\ntree to encode the structural information and calculate logical complexity by\nconsidering the difficulty and the cyclomatic complexity. Through an empirical\nanalysis, we find not all code data of complexity can be learned or understood\nby LLMs. Optimal level of complexity is critical to the improvement of\nreasoning abilities by program-aided prompting. Then we design an\nauto-synthesizing and stratifying algorithm, and apply it to instruction\ngeneration for mathematical reasoning and code data filtering for code\ngeneration tasks. Extensive results demonstrates the effectiveness of our\nproposed approach. Code will be integrated into the EasyInstruct framework at\nhttps://github.com/zjunlp/EasyInstruct.\n","authors":["Zhen Bi","Ningyu Zhang","Yinuo Jiang","Shumin Deng","Guozhou Zheng","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2308.15452v3.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2305.11490v4","updated":"2023-10-17T12:16:03Z","published":"2023-05-19T07:44:39Z","title":"LLM-CXR: Instruction-Finetuned LLM for CXR Image Understanding and\n Generation","summary":" Following the impressive development of LLMs, vision-language alignment in\nLLMs is actively being researched to enable multimodal reasoning and visual IO.\nThis direction of research is particularly relevant to medical imaging because\nmedical image analysis and generation consist of reasoning based on a\ncombination of visual features and prior knowledge. Many recent works have\nfocused on training adapter networks that serve as an information bridge\nbetween image processing networks and LLMs; but presumably, in order to achieve\nmaximum reasoning potential of LLMs on visual information as well, visual and\nlanguage features should be allowed to interact more freely. This is especially\nimportant in the medical domain because understanding and generating medical\nimages such as chest X-rays (CXR) require not only accurate visual and\nlanguage-based reasoning but also a more intimate mapping between the two\nmodalities. 
Thus, taking inspiration from previous work on the transformer and\nVQ-GAN combination for bidirectional image and text generation, we build upon\nthis approach and develop a method for instruction-tuning an LLM pre-trained\nonly on text to gain vision-language capabilities for medical images.\nSpecifically, we leverage a pretrained LLM's existing question-answering and\ninstruction-following abilities to teach it to understand visual inputs by\ninstructing it to answer questions about image inputs and, symmetrically,\noutput both text and image responses appropriate to a given query by tuning the\nLLM with diverse tasks that encompass image-based text-generation and\ntext-based image-generation. We show that our model, LLM-CXR, trained in this\napproach shows better image-text alignment in both CXR understanding and\ngeneration tasks while being smaller in size compared to previously developed\nmodels that perform a narrower range of tasks. The code is at\nhttps://github.com/hyn2028/llm-cxr.\n","authors":["Suhyeon Lee","Won Jun Kim","Jinho Chang","Jong Chul Ye"],"pdf_url":"https://arxiv.org/pdf/2305.11490v4.pdf","comment":"20 pages, 8 figures"},{"id":"http://arxiv.org/abs/2310.11191v1","updated":"2023-10-17T12:14:03Z","published":"2023-10-17T12:14:03Z","title":"Medical Text Simplification: Optimizing for Readability with\n Unlikelihood Training and Reranked Beam Search Decoding","summary":" Text simplification has emerged as an increasingly useful application of AI\nfor bridging the communication gap in specialized fields such as medicine,\nwhere the lexicon is often dominated by technical jargon and complex\nconstructs. Despite notable progress, methods in medical simplification\nsometimes result in the generated text having lower quality and diversity. In\nthis work, we explore ways to further improve the readability of text\nsimplification in the medical domain. We propose (1) a new unlikelihood loss\nthat encourages generation of simpler terms and (2) a reranked beam search\ndecoding method that optimizes for simplicity, which achieve better performance\non readability metrics on three datasets. This study's findings offer promising\navenues for improving text simplification in the medical field.\n","authors":["Lorenzo Jaime Yu Flores","Heyuan Huang","Kejian Shi","Sophie Chheang","Arman Cohan"],"pdf_url":"https://arxiv.org/pdf/2310.11191v1.pdf","comment":"EMNLP 2023 Findings"},{"id":"http://arxiv.org/abs/2305.13303v2","updated":"2023-10-17T11:58:37Z","published":"2023-05-22T17:58:04Z","title":"Towards Unsupervised Recognition of Semantic Differences in Related\n Documents","summary":" Automatically highlighting words that cause semantic differences between two\ndocuments could be useful for a wide range of applications. We formulate\nrecognizing semantic differences (RSD) as a token-level regression task and\nstudy three unsupervised approaches that rely on a masked language model. To\nassess the approaches, we begin with basic English sentences and gradually move\nto more complex, cross-lingual document pairs. Our results show that an\napproach based on word alignment and sentence-level contrastive learning has a\nrobust correlation to gold labels. However, all unsupervised approaches still\nleave a large margin of improvement. 
Code to reproduce our experiments is\navailable at https://github.com/ZurichNLP/recognizing-semantic-differences\n","authors":["Jannis Vamvas","Rico Sennrich"],"pdf_url":"https://arxiv.org/pdf/2305.13303v2.pdf","comment":"EMNLP 2023"},{"id":"http://arxiv.org/abs/2310.11166v1","updated":"2023-10-17T11:34:50Z","published":"2023-10-17T11:34:50Z","title":"ViSoBERT: A Pre-Trained Language Model for Vietnamese Social Media Text\n Processing","summary":" English and Chinese, known as resource-rich languages, have witnessed the\nstrong development of transformer-based language models for natural language\nprocessing tasks. Although Vietnam has approximately 100M people speaking\nVietnamese, several pre-trained models, e.g., PhoBERT, ViBERT, and vELECTRA,\nperformed well on general Vietnamese NLP tasks, including POS tagging and named\nentity recognition. These pre-trained language models are still limited to\nVietnamese social media tasks. In this paper, we present the first monolingual\npre-trained language model for Vietnamese social media texts, ViSoBERT, which\nis pre-trained on a large-scale corpus of high-quality and diverse Vietnamese\nsocial media texts using XLM-R architecture. Moreover, we explored our\npre-trained model on five important natural language downstream tasks on\nVietnamese social media texts: emotion recognition, hate speech detection,\nsentiment analysis, spam reviews detection, and hate speech spans detection.\nOur experiments demonstrate that ViSoBERT, with far fewer parameters, surpasses\nthe previous state-of-the-art models on multiple Vietnamese social media tasks.\nOur ViSoBERT model is\navailable\\footnote{\\url{https://huggingface.co/uitnlp/visobert}} only for\nresearch purposes.\n","authors":["Quoc-Nam Nguyen","Thang Chau Phan","Duc-Vu Nguyen","Kiet Van Nguyen"],"pdf_url":"https://arxiv.org/pdf/2310.11166v1.pdf","comment":"Accepted at EMNLP'2023 Main Conference"},{"id":"http://arxiv.org/abs/2310.11163v1","updated":"2023-10-17T11:29:04Z","published":"2023-10-17T11:29:04Z","title":"IMTLab: An Open-Source Platform for Building, Evaluating, and Diagnosing\n Interactive Machine Translation Systems","summary":" We present IMTLab, an open-source end-to-end interactive machine translation\n(IMT) system platform that enables researchers to quickly build IMT systems\nwith state-of-the-art models, perform an end-to-end evaluation, and diagnose\nthe weakness of systems. IMTLab treats the whole interactive translation\nprocess as a task-oriented dialogue with a human-in-the-loop setting, in which\nhuman interventions can be explicitly incorporated to produce high-quality,\nerror-free translations. To this end, a general communication interface is\ndesigned to support the flexible IMT architectures and user policies. Based on\nthe proposed design, we construct a simulated and real interactive environment\nto achieve end-to-end evaluation and leverage the framework to systematically\nevaluate previous IMT systems. 
Our simulated and manual experiments show that\nthe prefix-constrained decoding approach still gains the lowest editing cost in\nthe end-to-end evaluation, while BiTIIMT achieves comparable editing cost with\na better interactive experience.\n","authors":["Xu Huang","Zhirui Zhang","Ruize Gao","Yichao Du","Lemao Liu","Gouping Huang","Shuming Shi","Jiajun Chen","Shujian Huang"],"pdf_url":"https://arxiv.org/pdf/2310.11163v1.pdf","comment":"Accepted by EMNLP2023"},{"id":"http://arxiv.org/abs/2301.11916v3","updated":"2023-10-17T11:24:33Z","published":"2023-01-27T18:59:01Z","title":"Large Language Models Are Latent Variable Models: Explaining and Finding\n Good Demonstrations for In-Context Learning","summary":" In recent years, pre-trained large language models (LLMs) have demonstrated\nremarkable efficiency in achieving an inference-time few-shot learning\ncapability known as in-context learning. However, existing literature has\nhighlighted the sensitivity of this capability to the selection of few-shot\ndemonstrations. Current understandings of the underlying mechanisms by which\nthis capability arises from regular language model pretraining objectives\nremain disconnected from the real-world LLMs. This study aims to examine the\nin-context learning phenomenon through a Bayesian lens, viewing real-world LLMs\nas latent variable models. On this premise, we propose an algorithm to select\noptimal demonstrations from a set of annotated data with a small LM, and then\ndirectly generalize the selected demonstrations to larger LMs. We demonstrate\nsignificant improvement over baselines, averaged over eight GPT models on eight\nreal-world text classification datasets. We also demonstrate the real-world\nusefulness of our algorithm on GSM8K, a math word problem dataset. Our\nempirical findings support our hypothesis that LLMs implicitly infer a latent\nvariable containing task information.\n","authors":["Xinyi Wang","Wanrong Zhu","Michael Saxon","Mark Steyvers","William Yang Wang"],"pdf_url":"https://arxiv.org/pdf/2301.11916v3.pdf","comment":"code at:\n https://github.com/WANGXinyiLinda/concept-based-demonstration-selection\n Accepted to NeurIPS 2023"},{"id":"http://arxiv.org/abs/2310.11158v1","updated":"2023-10-17T11:23:32Z","published":"2023-10-17T11:23:32Z","title":"Probing the Creativity of Large Language Models: Can models produce\n divergent semantic association?","summary":" Large language models possess remarkable capacity for processing language,\nbut it remains unclear whether these models can further generate creative\ncontent. The present study aims to investigate the creative thinking of large\nlanguage models through a cognitive perspective. We utilize the divergent\nassociation task (DAT), an objective measurement of creativity that asks models\nto generate unrelated words and calculates the semantic distance between them.\nWe compare the results across different models and decoding strategies. Our\nfindings indicate that: (1) When using the greedy search strategy, GPT-4\noutperforms 96% of humans, while GPT-3.5-turbo exceeds the average human level.\n(2) Stochastic sampling and temperature scaling are effective to obtain higher\nDAT scores for models except GPT-4, but face a trade-off between creativity and\nstability. 
These results imply that advanced large language models have\ndivergent semantic associations, which is a fundamental process underlying\ncreativity.\n","authors":["Honghua Chen","Nai Ding"],"pdf_url":"https://arxiv.org/pdf/2310.11158v1.pdf","comment":"Accepted for publication in Findings of EMNLP 2023"},{"id":"http://arxiv.org/abs/2310.11146v1","updated":"2023-10-17T10:54:24Z","published":"2023-10-17T10:54:24Z","title":"The Quo Vadis of the Relationship between Language and Large Language\n Models","summary":" In the field of Artificial (General) Intelligence (AI), the several recent\nadvancements in Natural language processing (NLP) activities relying on Large\nLanguage Models (LLMs) have come to encourage the adoption of LLMs as\nscientific models of language. While the terminology employed for the\ncharacterization of LLMs favors their embracing as such, it is not clear that\nthey are in a place to offer insights into the target system they seek to\nrepresent. After identifying the most important theoretical and empirical risks\nbrought about by the adoption of scientific models that lack transparency, we\ndiscuss LLMs relating them to every scientific model's fundamental components:\nthe object, the medium, the meaning and the user. We conclude that, at their\ncurrent stage of development, LLMs hardly offer any explanations for language,\nand then we provide an outlook for more informative future research directions\non this topic.\n","authors":["Evelina Leivada","Vittoria Dentella","Elliot Murphy"],"pdf_url":"https://arxiv.org/pdf/2310.11146v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11141v1","updated":"2023-10-17T10:44:05Z","published":"2023-10-17T10:44:05Z","title":"Long-form Simultaneous Speech Translation: Thesis Proposal","summary":" Simultaneous speech translation (SST) aims to provide real-time translation\nof spoken language, even before the speaker finishes their sentence.\nTraditionally, SST has been addressed primarily by cascaded systems that\ndecompose the task into subtasks, including speech recognition, segmentation,\nand machine translation. However, the advent of deep learning has sparked\nsignificant interest in end-to-end (E2E) systems. Nevertheless, a major\nlimitation of most approaches to E2E SST reported in the current literature is\nthat they assume that the source speech is pre-segmented into sentences, which\nis a significant obstacle for practical, real-world applications. This thesis\nproposal addresses end-to-end simultaneous speech translation, particularly in\nthe long-form setting, i.e., without pre-segmentation. We present a survey of\nthe latest advancements in E2E SST, assess the primary obstacles in SST and its\nrelevance to long-form scenarios, and suggest approaches to tackle these\nchallenges.\n","authors":["Peter Polák"],"pdf_url":"https://arxiv.org/pdf/2310.11141v1.pdf","comment":"IJCNLP-AACL SRW 2023 - camera-ready version"},{"id":"http://arxiv.org/abs/2306.17181v4","updated":"2023-10-17T10:41:12Z","published":"2023-06-19T10:22:12Z","title":"Unsupervised Text Embedding Space Generation Using Generative\n Adversarial Networks for Text Synthesis","summary":" Generative Adversarial Networks (GAN) is a model for data synthesis, which\ncreates plausible data through the competition of generator and discriminator.\nAlthough GAN application to image synthesis is extensively studied, it has\ninherent limitations to natural language generation. 
Because natural language\nis composed of discrete tokens, a generator has difficulty updating its\ngradient through backpropagation; therefore, most text-GAN studies generate\nsentences starting with a random token based on a reward system. Thus, the\ngenerators of previous studies are pre-trained in an autoregressive way before\nadversarial training, causing data memorization such that synthesized sentences\nreproduce the training data. In this paper, we synthesize sentences using a\nframework similar to the original GAN. More specifically, we propose Text\nEmbedding Space Generative Adversarial Networks (TESGAN) which generate\ncontinuous text embedding spaces instead of discrete tokens to solve the\ngradient backpropagation problem. Furthermore, TESGAN conducts unsupervised\nlearning which does not directly refer to the text of the training data to\novercome the data memorization issue. By adopting this novel method, TESGAN can\nsynthesize new sentences, showing the potential of unsupervised learning for\ntext synthesis. We expect to see extended research combining Large Language\nModels with a new perspective of viewing text as a continuous space.\n","authors":["Jun-Min Lee","Tae-Bin Ha"],"pdf_url":"https://arxiv.org/pdf/2306.17181v4.pdf","comment":"NEJLT accepted"},{"id":"http://arxiv.org/abs/2310.09619v2","updated":"2023-10-17T10:10:49Z","published":"2023-10-14T17:00:28Z","title":"An Expression Tree Decoding Strategy for Mathematical Equation\n Generation","summary":" Generating mathematical equations from natural language requires an accurate\nunderstanding of the relations among math expressions. Existing approaches can\nbe broadly categorized into token-level and expression-level generation. The\nformer treats equations as a mathematical language, sequentially generating\nmath tokens. Expression-level methods generate each expression one by one.\nHowever, each expression represents a solving step, and there naturally exist\nparallel or dependent relations between these steps, which are ignored by\ncurrent sequential methods. Therefore, we integrate tree structure into the\nexpression-level generation and advocate an expression tree decoding strategy.\nTo generate a tree with expression as its node, we employ a layer-wise parallel\ndecoding strategy: we decode multiple independent expressions (leaf nodes) in\nparallel at each layer and repeat parallel decoding layer by layer to\nsequentially generate these parent node expressions that depend on others.\nBesides, a bipartite matching algorithm is adopted to align multiple\npredictions with annotations for each layer. Experiments show our method\noutperforms other baselines, especially for those equations with complex\nstructures.\n","authors":["Wenqi Zhang","Yongliang Shen","Qingpeng Nong","Zeqi Tan","Yanna Ma","Weiming Lu"],"pdf_url":"https://arxiv.org/pdf/2310.09619v2.pdf","comment":"Accepted to EMNLP-2023, camera-ready version"},{"id":"http://arxiv.org/abs/2304.13734v2","updated":"2023-10-17T09:34:30Z","published":"2023-04-26T02:49:38Z","title":"The Internal State of an LLM Knows When It's Lying","summary":" While Large Language Models (LLMs) have shown exceptional performance in\nvarious tasks, one of their most prominent drawbacks is generating inaccurate\nor false information with a confident tone. In this paper, we provide evidence\nthat the LLM's internal state can be used to reveal the truthfulness of\nstatements. This includes both statements provided to the LLM, and statements\nthat the LLM itself generates. 
Our approach is to train a classifier that\noutputs the probability that a statement is truthful, based on the hidden layer\nactivations of the LLM as it reads or generates the statement. Experiments\ndemonstrate that given a set of test sentences, of which half are true and half\nfalse, our trained classifier achieves an average of 71\\% to 83\\% accuracy\nlabeling which sentences are true versus false, depending on the LLM base\nmodel. Furthermore, we explore the relationship between our classifier's\nperformance and approaches based on the probability assigned to the sentence by\nthe LLM. We show that while LLM-assigned sentence probability is related to\nsentence truthfulness, this probability is also dependent on sentence length\nand the frequencies of words in the sentence, resulting in our trained\nclassifier providing a more reliable approach to detecting truthfulness,\nhighlighting its potential to enhance the reliability of LLM-generated content\nand its practical applicability in real-world scenarios.\n","authors":["Amos Azaria","Tom Mitchell"],"pdf_url":"https://arxiv.org/pdf/2304.13734v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.10333v2","updated":"2023-10-17T09:33:08Z","published":"2023-10-16T12:17:11Z","title":"NLP for Crypto-Asset Regulation: A Roadmap","summary":" In the rapidly evolving field of crypto-assets, white papers are essential\ndocuments for investor guidance, and are now subject to unprecedented content\nrequirements under the EU's Markets in Crypto-Assets Regulation (MiCAR).\nNatural Language Processing can serve as a powerful tool for both analyzing\nthese documents and assisting in regulatory compliance. This paper delivers two\ncontributions to the topic. First, we survey existing applications of textual\nanalysis to unregulated crypto-asset white papers, uncovering a research gap\nthat could be bridged with interdisciplinary collaboration. We then conduct an\nanalysis of the changes introduced by MiCAR, highlighting the opportunities and\nchallenges of integrating NLP within the new regulatory framework. The findings\nset the stage for further research, with the potential to benefit regulators,\ncrypto-asset issuers, and investors.\n","authors":["Carolina Camassa"],"pdf_url":"https://arxiv.org/pdf/2310.10333v2.pdf","comment":"Accepted at NLLP23"},{"id":"http://arxiv.org/abs/2310.11097v1","updated":"2023-10-17T09:27:43Z","published":"2023-10-17T09:27:43Z","title":"Experimenting AI Technologies for Disinformation Combat: the IDMO\n Project","summary":" The Italian Digital Media Observatory (IDMO) project, part of a European\ninitiative, focuses on countering disinformation and fake news. 
This report\noutlines contributions from Rai-CRITS to the project, including: (i) the\ncreation of novel datasets for testing technologies; (ii) development of an\nautomatic model for categorizing Pagella Politica verdicts to facilitate\nbroader analysis; (iii) creation of an automatic model for recognizing textual\nentailment with exceptional accuracy on the FEVER dataset; (iv) assessment using\nGPT-4 to identify textual entailment; and (v) a game to raise awareness about fake\nnews at national events.\n","authors":["Lorenzo Canale","Alberto Messina"],"pdf_url":"https://arxiv.org/pdf/2310.11097v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11085v1","updated":"2023-10-17T09:10:27Z","published":"2023-10-17T09:10:27Z","title":"In-Context Few-Shot Relation Extraction via Pre-Trained Language Models","summary":" Relation extraction aims at inferring structured human knowledge from textual\ndocuments. State-of-the-art methods based on language models commonly have two\nlimitations: (1) they require named entities to be either given as input or\ninferred, which introduces additional noise, and (2) they require human\nannotations of documents. As a remedy, we present a novel framework for\nin-context few-shot relation extraction via pre-trained language models. To the\nbest of our knowledge, we are the first to reformulate the relation extraction\ntask as a tailored in-context few-shot learning paradigm. Thereby, we achieve\ncrucial benefits in that we eliminate the need for both named entity\nrecognition and human annotation of documents. Unlike existing methods based on\nfine-tuning, our framework is flexible in that it can be easily updated for a\nnew set of relations without re-training. We evaluate our framework using\nDocRED, the largest publicly available dataset for document-level relation\nextraction, and demonstrate that our framework achieves state-of-the-art\nperformance. Finally, our framework allows us to identify missing annotations,\nand we thus show that our framework actually performs much better than the\noriginal labels from the development set of DocRED.\n","authors":["Yilmazcan Ozyurt","Stefan Feuerriegel","Ce Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.11085v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.17148v2","updated":"2023-10-17T09:06:16Z","published":"2022-11-30T16:37:42Z","title":"ConvLab-3: A Flexible Dialogue System Toolkit Based on a Unified Data\n Format","summary":" Task-oriented dialogue (TOD) systems function as digital assistants, guiding\nusers through various tasks such as booking flights or finding restaurants.\nExisting toolkits for building TOD systems often fall short in delivering\ncomprehensive arrays of data, models, and experimental environments with a\nuser-friendly experience. We introduce ConvLab-3: a multifaceted dialogue\nsystem toolkit crafted to bridge this gap. Our unified data format simplifies\nthe integration of diverse datasets and models, significantly reducing\ncomplexity and cost for studying generalization and transfer. Enhanced with\nrobust reinforcement learning (RL) tools, featuring a streamlined training\nprocess, in-depth evaluation tools, and a selection of user simulators,\nConvLab-3 supports the rapid development and evaluation of robust dialogue\npolicies. 
Through an extensive study, we demonstrate the efficacy of transfer\nlearning and RL and showcase that ConvLab-3 is not only a powerful tool for\nseasoned researchers but also an accessible platform for newcomers.\n","authors":["Qi Zhu","Christian Geishauser","Hsien-chin Lin","Carel van Niekerk","Baolin Peng","Zheng Zhang","Michael Heck","Nurul Lubis","Dazhen Wan","Xiaochen Zhu","Jianfeng Gao","Milica Gašić","Minlie Huang"],"pdf_url":"https://arxiv.org/pdf/2211.17148v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11081v1","updated":"2023-10-17T09:01:17Z","published":"2023-10-17T09:01:17Z","title":"Understanding writing style in social media with a supervised\n contrastively pre-trained transformer","summary":" Online Social Networks serve as fertile ground for harmful behavior, ranging\nfrom hate speech to the dissemination of disinformation. Malicious actors now\nhave unprecedented freedom to misbehave, leading to severe societal unrest and\ndire consequences, as exemplified by events such as the Capitol assault during\nthe US presidential election and the Antivaxx movement during the COVID-19\npandemic. Understanding online language has become more pressing than ever.\nWhile existing works predominantly focus on content analysis, we aim to shift\nthe focus towards understanding harmful behaviors by relating content to their\nrespective authors. Numerous novel approaches attempt to learn the stylistic\nfeatures of authors in texts, but many of these approaches are constrained by\nsmall datasets or sub-optimal training losses. To overcome these limitations,\nwe introduce the Style Transformer for Authorship Representations (STAR),\ntrained on a large corpus derived from public sources of 4.5 x 10^6 authored\ntexts involving 70k heterogeneous authors. Our model leverages Supervised\nContrastive Loss to teach the model to minimize the distance between texts\nauthored by the same individual. This author pretext pre-training task yields\ncompetitive performance at zero-shot with PAN challenges on attribution and\nclustering. Additionally, we attain promising results on PAN verification\nchallenges using a single dense layer, with our model serving as an embedding\nencoder. Finally, we present results from our test partition on Reddit. Using a\nsupport base of 8 documents of 512 tokens, we can discern authors from sets of\nup to 1616 authors with at least 80\\% accuracy. We share our pre-trained model\nat huggingface (https://huggingface.co/AIDA-UPM/star) and our code is available\nat (https://github.com/jahuerta92/star)\n","authors":["Javier Huertas-Tato","Alejandro Martin","David Camacho"],"pdf_url":"https://arxiv.org/pdf/2310.11081v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11079v1","updated":"2023-10-17T08:56:04Z","published":"2023-10-17T08:56:04Z","title":"Learning from Red Teaming: Gender Bias Provocation and Mitigation in\n Large Language Models","summary":" Recently, researchers have made considerable improvements in dialogue systems\nwith the progress of large language models (LLMs) such as ChatGPT and GPT-4.\nThese LLM-based chatbots encode the potential biases while retaining\ndisparities that can harm humans during interactions. The traditional biases\ninvestigation methods often rely on human-written test cases. However, these\ntest cases are usually expensive and limited. In this work, we propose a\nfirst-of-its-kind method that automatically generates test cases to detect\nLLMs' potential gender bias. 
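The STAR abstract above trains its authorship encoder with a Supervised Contrastive Loss that pulls together texts written by the same author. A small numpy sketch of the generic supervised contrastive objective follows; it uses the standard formulation and is not claimed to match the paper's exact implementation:

```python
import numpy as np

def supervised_contrastive_loss(embeddings, author_ids, temperature=0.07):
    """Mean SupCon loss over anchors that have at least one positive.

    embeddings: (n, d) text embeddings; author_ids: (n,) author label per text.
    """
    z = np.asarray(embeddings, dtype=float)
    z = z / np.linalg.norm(z, axis=1, keepdims=True)
    author_ids = np.asarray(author_ids)
    sim = z @ z.T / temperature                       # pairwise similarities
    np.fill_diagonal(sim, -np.inf)                    # exclude self-pairs
    log_prob = sim - np.log(np.exp(sim).sum(axis=1, keepdims=True))
    positives = author_ids[:, None] == author_ids[None, :]
    np.fill_diagonal(positives, False)
    losses = []
    for i in range(len(z)):
        pos = np.where(positives[i])[0]
        if len(pos):                                  # average over same-author texts
            losses.append(-log_prob[i, pos].mean())
    return float(np.mean(losses))
```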
We apply our method to three well-known LLMs and\nfind that the generated test cases effectively identify the presence of biases.\nTo address the biases identified, we propose a mitigation strategy that uses\nthe generated test cases as demonstrations for in-context learning to\ncircumvent the need for parameter fine-tuning. The experimental results show\nthat LLMs generate fairer responses with the proposed approach.\n","authors":["Hsuan Su","Cheng-Chu Cheng","Hua Farn","Shachi H Kumar","Saurav Sahay","Shang-Tse Chen","Hung-yi Lee"],"pdf_url":"https://arxiv.org/pdf/2310.11079v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11069v1","updated":"2023-10-17T08:33:02Z","published":"2023-10-17T08:33:02Z","title":"VoxArabica: A Robust Dialect-Aware Arabic Speech Recognition System","summary":" Arabic is a complex language with many varieties and dialects spoken by over\n450 million people around the world. Due to the linguistic diversity and\nvariations, it is challenging to build a robust and generalized ASR system for\nArabic. In this work, we address this gap by developing and demoing a system,\ndubbed VoxArabica, for dialect identification (DID) as well as automatic speech\nrecognition (ASR) of Arabic. We train a wide range of models such as HuBERT\n(DID), Whisper, and XLS-R (ASR) in a supervised setting for Arabic DID and ASR\ntasks. Our DID models are trained to identify 17 different dialects in addition\nto MSA. We finetune our ASR models on MSA, Egyptian, Moroccan, and mixed data.\nAdditionally, for the remaining dialects in ASR, we provide the option to\nchoose various models such as Whisper and MMS in a zero-shot setting. We\nintegrate these models into a single web interface with diverse features such\nas audio recording, file upload, model selection, and the option to raise flags\nfor incorrect outputs. Overall, we believe VoxArabica will be useful for a wide\nrange of audiences concerned with Arabic research. Our system is currently\nrunning at https://cdce-206-12-100-168.ngrok.io/.\n","authors":["Abdul Waheed","Bashar Talafha","Peter Suvellin","Abdelrahman Elmadney","Muhammad Abdul-Mageed"],"pdf_url":"https://arxiv.org/pdf/2310.11069v1.pdf","comment":"Accepted at ArabicNLP conference co-located with EMNLP'23"},{"id":"http://arxiv.org/abs/2305.14342v3","updated":"2023-10-17T07:44:16Z","published":"2023-05-23T17:59:21Z","title":"Sophia: A Scalable Stochastic Second-order Optimizer for Language Model\n Pre-training","summary":" Given the massive cost of language model pre-training, a non-trivial\nimprovement of the optimization algorithm would lead to a material reduction in\nthe time and cost of training. Adam and its variants have been state-of-the-art\nfor years, and more sophisticated second-order (Hessian-based) optimizers often\nincur too much per-step overhead. In this paper, we propose Sophia,\nSecond-order Clipped Stochastic Optimization, a simple scalable second-order\noptimizer that uses a light-weight estimate of the diagonal Hessian as the\npre-conditioner. The update is the moving average of the gradients divided by\nthe moving average of the estimated Hessian, followed by element-wise clipping.\nThe clipping controls the worst-case update size and tames the negative impact\nof non-convexity and rapid change of Hessian along the trajectory. Sophia only\nestimates the diagonal Hessian every handful of iterations, which has\nnegligible average per-step time and memory overhead. 
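As a rough illustration of the Sophia update just described (an exponential moving average of gradients divided element-wise by a moving average of a diagonal Hessian estimate, followed by element-wise clipping), here is a simplified single-tensor sketch; the Hessian estimator itself is abstracted away and the hyperparameter values are placeholders rather than the paper's settings:

```python
import numpy as np

def sophia_like_step(param, grad, m, h, hess_estimate=None,
                     lr=1e-4, beta1=0.96, beta2=0.99, rho=0.04, eps=1e-12):
    """One simplified Sophia-style update on a numpy parameter tensor.

    m, h: running averages of the gradient and of a diagonal Hessian estimate.
    hess_estimate: fresh diagonal Hessian estimate, supplied only every k steps.
    """
    m = beta1 * m + (1 - beta1) * grad           # EMA of gradients
    if hess_estimate is not None:                # Hessian refreshed infrequently
        h = beta2 * h + (1 - beta2) * hess_estimate
    update = np.clip(m / np.maximum(h, eps), -rho, rho)   # precondition, then clip
    return param - lr * update, m, h
```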
On language modeling with\nGPT models of sizes ranging from 125M to 1.5B, Sophia achieves a 2x speed-up\ncompared to Adam in the number of steps, total compute, and wall-clock time,\nachieving the same perplexity with 50% fewer steps, less total compute, and\nreduced wall-clock time. Theoretically, we show that Sophia, in a much\nsimplified setting, adapts to the heterogeneous curvatures in different\nparameter dimensions, and thus has a run-time bound that does not depend on the\ncondition number of the loss.\n","authors":["Hong Liu","Zhiyuan Li","David Hall","Percy Liang","Tengyu Ma"],"pdf_url":"https://arxiv.org/pdf/2305.14342v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11053v1","updated":"2023-10-17T07:42:40Z","published":"2023-10-17T07:42:40Z","title":"Denevil: Towards Deciphering and Navigating the Ethical Values of Large\n Language Models via Instruction Learning","summary":" Large Language Models (LLMs) have made unprecedented breakthroughs, yet their\nincreasing integration into everyday life might raise societal risks due to\ngenerated unethical content. Despite extensive study on specific issues like\nbias, the intrinsic values of LLMs remain largely unexplored from a moral\nphilosophy perspective. This work delves into ethical values utilizing Moral\nFoundation Theory. Moving beyond conventional discriminative evaluations with\npoor reliability, we propose DeNEVIL, a novel prompt generation algorithm\ntailored to dynamically exploit LLMs' value vulnerabilities and elicit the\nviolation of ethics in a generative manner, revealing their underlying value\ninclinations. On such a basis, we construct MoralPrompt, a high-quality dataset\ncomprising 2,397 prompts covering 500+ value principles, and then benchmark the\nintrinsic values across a spectrum of LLMs. We discovered that most models are\nessentially misaligned, necessitating further ethical value alignment. In\nresponse, we develop VILMO, an in-context alignment method that substantially\nenhances the value compliance of LLM outputs by learning to generate\nappropriate value instructions, outperforming existing competitors. Our methods\nare suitable for black-box and open-source models, offering a promising initial\nstep in studying the ethical values of LLMs.\n","authors":["Shitong Duan","Xiaoyuan Yi","Peng Zhang","Tun Lu","Xing Xie","Ning Gu"],"pdf_url":"https://arxiv.org/pdf/2310.11053v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11049v1","updated":"2023-10-17T07:35:11Z","published":"2023-10-17T07:35:11Z","title":"Nonet at SemEval-2023 Task 6: Methodologies for Legal Evaluation","summary":" This paper describes our submission to the SemEval-2023 for Task 6 on\nLegalEval: Understanding Legal Texts. Our submission concentrated on three\nsubtasks: Legal Named Entity Recognition (L-NER) for Task-B, Legal Judgment\nPrediction (LJP) for Task-C1, and Court Judgment Prediction with Explanation\n(CJPE) for Task-C2. We conducted various experiments on these subtasks and\npresented the results in detail, including data statistics and methodology. It\nis worth noting that legal tasks, such as those tackled in this research, have\nbeen gaining importance due to the increasing need to automate legal analysis\nand support. 
Our team obtained competitive rankings of 15$^{th}$, 11$^{th}$,\nand 1$^{st}$ in Task-B, Task-C1, and Task-C2, respectively, as reported on the\nleaderboard.\n","authors":["Shubham Kumar Nigam","Aniket Deroy","Noel Shallum","Ayush Kumar Mishra","Anup Roy","Shubham Kumar Mishra","Arnab Bhattacharya","Saptarshi Ghosh","Kripabandhu Ghosh"],"pdf_url":"https://arxiv.org/pdf/2310.11049v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2105.01331v3","updated":"2023-10-17T07:30:40Z","published":"2021-05-04T07:27:42Z","title":"BLM-17m: A Large-Scale Dataset for Black Lives Matter Topic Detection on\n Twitter","summary":" Protection of human rights is one of the most important problems of our\nworld. In this paper, our aim is to provide a dataset which covers one of the\nmost significant human rights contradiction in recent months affected the whole\nworld, George Floyd incident. We propose a labeled dataset for topic detection\nthat contains 17 million tweets. These Tweets are collected from 25 May 2020 to\n21 August 2020 that covers 89 days from start of this incident. We labeled the\ndataset by monitoring most trending news topics from global and local\nnewspapers. Apart from that, we present two baselines, TF-IDF and LDA. We\nevaluated the results of these two methods with three different k values for\nmetrics of precision, recall and f1-score. The collected dataset is available\nat https://github.com/MeysamAsgariC/BLMT.\n","authors":["Hasan Kemik","Nusret Özateş","Meysam Asgari-Chenaghlu","Yang Li","Erik Cambria"],"pdf_url":"https://arxiv.org/pdf/2105.01331v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.13780v3","updated":"2023-10-17T07:21:05Z","published":"2023-03-24T03:35:21Z","title":"Towards Making the Most of ChatGPT for Machine Translation","summary":" ChatGPT shows remarkable capabilities for machine translation (MT). Several\nprior studies have shown that it achieves comparable results to commercial\nsystems for high-resource languages, but lags behind in complex tasks, e.g.,\nlow-resource and distant-language-pairs translation. However, they usually\nadopt simple prompts which can not fully elicit the capability of ChatGPT. In\nthis paper, we aim to further mine ChatGPT's translation ability by revisiting\nseveral aspects: temperature, task information, and domain information, and\ncorrespondingly propose an optimal temperature setting and two (simple but\neffective) prompts: Task-Specific Prompts (TSP) and Domain-Specific Prompts\n(DSP). We show that: 1) The performance of ChatGPT depends largely on\ntemperature, and a lower temperature usually can achieve better performance; 2)\nEmphasizing the task information can further improve ChatGPT's performance,\nparticularly in complex MT tasks; 3) Introducing domain information can elicit\nChatGPT's generalization ability and improve its performance in the specific\ndomain; 4) ChatGPT tends to generate hallucinations for non-English-centric MT\ntasks, which can be partially addressed by our proposed prompts but still need\nto be highlighted for the MT/NLP community. 
We also explore the effects of\nadvanced in-context learning strategies and find a (negative but interesting)\nobservation: the powerful chain-of-thought prompt leads to word-by-word\ntranslation behavior, thus bringing significant translation degradation.\n","authors":["Keqin Peng","Liang Ding","Qihuang Zhong","Li Shen","Xuebo Liu","Min Zhang","Yuanxin Ouyang","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2303.13780v3.pdf","comment":"EMNLP 2023 (findings)"},{"id":"http://arxiv.org/abs/2310.11035v1","updated":"2023-10-17T07:02:26Z","published":"2023-10-17T07:02:26Z","title":"Lyricist-Singer Entropy Affects Lyric-Lyricist Classification\n Performance","summary":" Although lyrics represent an essential component of music, few music\ninformation processing studies have been conducted on the characteristics of\nlyricists. Because these characteristics may be valuable for musical\napplications, such as recommendations, they warrant further study. We\nconsidered a potential method that extracts features representing the\ncharacteristics of lyricists from lyrics. Because these features must be\nidentified prior to extraction, we focused on lyricists with easily\nidentifiable features. We believe that it is desirable for singers to perform\nunique songs that share certain characteristics specific to the singer.\nAccordingly, we hypothesized that lyricists account for the unique\ncharacteristics of the singers they write lyrics for. In other words,\nlyric-lyricist classification performance or the ease of capturing the features\nof a lyricist from the lyrics may depend on the variety of singers. In this\nstudy, we observed a relationship between lyricist-singer entropy or the\nvariety of singers associated with a single lyricist and lyric-lyricist\nclassification performance. As an example, the lyricist-singer entropy is\nminimal when the lyricist writes lyrics for only one singer. In our\nexperiments, we grouped lyricists among five groups in terms of lyricist-singer\nentropy and assessed the lyric-lyricist classification performance within each\ngroup. Consequently, the best F1 score was obtained for the group with the\nlowest lyricist-singer entropy. Our results suggest that further analyses of\nthe features contributing to lyric-lyricist classification performance on the\nlowest lyricist-singer entropy group may improve the feature extraction task\nfor lyricists.\n","authors":["Mitsuki Morita","Masato Kikuchi","Tadachika Ozono"],"pdf_url":"https://arxiv.org/pdf/2310.11035v1.pdf","comment":"The 10th International Conference on Advanced Informatics: Concepts,\n Theory and Applications (ICAICTA 2023)"},{"id":"http://arxiv.org/abs/2310.11026v1","updated":"2023-10-17T06:53:00Z","published":"2023-10-17T06:53:00Z","title":"Exploring Automatic Evaluation Methods based on a Decoder-based LLM for\n Text Generation","summary":" Automatic evaluation of text generation is essential for improving the\naccuracy of generation tasks. In light of the current trend towards\nincreasingly larger decoder-based language models, we investigate automatic\nevaluation methods based on such models for text generation. This paper\ncompares various methods, including tuning with encoder-based models and large\nlanguage models under equal conditions, on two different tasks, machine\ntranslation evaluation and semantic textual similarity, in two languages,\nJapanese and English. Experimental results show that compared to the tuned\nencoder-based models, the tuned decoder-based models perform poorly. 
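The lyricist-singer entropy used in the abstract above is, on the usual reading, the Shannon entropy of the distribution of singers a lyricist writes for; a small sketch under that assumption (the function name and example data are illustrative only):

```python
import math
from collections import Counter

def lyricist_singer_entropy(singers):
    """Shannon entropy (bits) of the singer distribution for one lyricist."""
    counts = Counter(singers)
    total = sum(counts.values())
    return -sum((c / total) * math.log2(c / total) for c in counts.values())

# A lyricist who writes for a single singer has minimal entropy.
print(lyricist_singer_entropy(["singer_a"] * 10))                     # minimal: 0 bits
print(lyricist_singer_entropy(["singer_a", "singer_b", "singer_c"]))  # ~1.585 bits
```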
The\nanalysis of the causes for this suggests that the decoder-based models focus on\nsurface word sequences and do not capture meaning. It is also revealed that\nin-context learning of very large decoder-based models such as ChatGPT makes it\ndifficult to identify fine-grained semantic differences.\n","authors":["Tomohito Kasahara","Daisuke Kawahara"],"pdf_url":"https://arxiv.org/pdf/2310.11026v1.pdf","comment":"Accepted to IJCNLP-AACL 2023 SRW"},{"id":"http://arxiv.org/abs/2307.03109v8","updated":"2023-10-17T06:28:04Z","published":"2023-07-06T16:28:35Z","title":"A Survey on Evaluation of Large Language Models","summary":" Large language models (LLMs) are gaining increasing popularity in both\nacademia and industry, owing to their unprecedented performance in various\napplications. As LLMs continue to play a vital role in both research and daily\nuse, their evaluation becomes increasingly critical, not only at the task\nlevel, but also at the society level for better understanding of their\npotential risks. Over the past years, significant efforts have been made to\nexamine LLMs from various perspectives. This paper presents a comprehensive\nreview of these evaluation methods for LLMs, focusing on three key dimensions:\nwhat to evaluate, where to evaluate, and how to evaluate. Firstly, we provide\nan overview from the perspective of evaluation tasks, encompassing general\nnatural language processing tasks, reasoning, medical usage, ethics,\neducations, natural and social sciences, agent applications, and other areas.\nSecondly, we answer the `where' and `how' questions by diving into the\nevaluation methods and benchmarks, which serve as crucial components in\nassessing performance of LLMs. Then, we summarize the success and failure cases\nof LLMs in different tasks. Finally, we shed light on several future challenges\nthat lie ahead in LLMs evaluation. Our aim is to offer invaluable insights to\nresearchers in the realm of LLMs evaluation, thereby aiding the development of\nmore proficient LLMs. Our key point is that evaluation should be treated as an\nessential discipline to better assist the development of LLMs. We consistently\nmaintain the related open-source materials at:\nhttps://github.com/MLGroupJLU/LLM-eval-survey.\n","authors":["Yupeng Chang","Xu Wang","Jindong Wang","Yuan Wu","Linyi Yang","Kaijie Zhu","Hao Chen","Xiaoyuan Yi","Cunxiang Wang","Yidong Wang","Wei Ye","Yue Zhang","Yi Chang","Philip S. Yu","Qiang Yang","Xing Xie"],"pdf_url":"https://arxiv.org/pdf/2307.03109v8.pdf","comment":"31 pages; a major update to include more recent works;\n https://llm-eval.github.io/"},{"id":"http://arxiv.org/abs/2310.11016v1","updated":"2023-10-17T06:08:55Z","published":"2023-10-17T06:08:55Z","title":"Reading Order Matters: Information Extraction from Visually-rich\n Documents by Token Path Prediction","summary":" Recent advances in multimodal pre-trained models have significantly improved\ninformation extraction from visually-rich documents (VrDs), in which named\nentity recognition (NER) is treated as a sequence-labeling task of predicting\nthe BIO entity tags for tokens, following the typical setting of NLP. However,\nBIO-tagging scheme relies on the correct order of model inputs, which is not\nguaranteed in real-world NER on scanned VrDs where text are recognized and\narranged by OCR systems. Such reading order issue hinders the accurate marking\nof entities by BIO-tagging scheme, making it impossible for sequence-labeling\nmethods to predict correct named entities. 
To address the reading order issue,\nwe introduce Token Path Prediction (TPP), a simple prediction head to predict\nentity mentions as token sequences within documents. As an alternative to token\nclassification, TPP models the document layout as a complete directed graph of\ntokens, and predicts token paths within the graph as entities. For better\nevaluation of VrD-NER systems, we also propose two revised benchmark datasets\nof NER on scanned documents which can reflect real-world scenarios. Experimental\nresults demonstrate the effectiveness of our method, and suggest its potential\nto be a universal solution to various information extraction tasks on\ndocuments.\n","authors":["Chong Zhang","Ya Guo","Yi Tu","Huan Chen","Jinyang Tang","Huijia Zhu","Qi Zhang","Tao Gui"],"pdf_url":"https://arxiv.org/pdf/2310.11016v1.pdf","comment":"Accepted as a long paper in the main conference of EMNLP 2023"},{"id":"http://arxiv.org/abs/2308.10335v4","updated":"2023-10-17T05:48:29Z","published":"2023-08-20T18:36:28Z","title":"Can ChatGPT replace StackOverflow? A Study on Robustness and Reliability\n of Large Language Model Code Generation","summary":" Recently, large language models (LLMs) have shown extraordinary ability\nin understanding natural language and generating programming code. It has been\na common practice of software engineers to consult LLMs when encountering\ncoding questions. Although efforts have been made to avoid syntax errors and\nalign the code with the intended semantics, the reliability and robustness of\ncode generation from LLMs have not yet been thoroughly studied. Executable\ncode is not equivalent to reliable and robust code, especially\nin the context of real-world software development. The misuse of APIs in the\ngenerated code could lead to severe problems, such as resource leaks and program\ncrashes. To make things worse, the users of LLM code generation services are\noften the developers most vulnerable to this seemingly correct code:\nnovice developers who are not familiar with the APIs for which\nLLMs generate code. Therefore, they can hardly spot the misuse in\nthe code generated by LLMs, which further facilitates incorrect code\nbeing applied in real-world software. Existing code evaluation benchmarks and datasets\nfocus on crafting small tasks such as programming questions in coding\ninterviews, which, however, deviate from the problems developers would ask an\nLLM about for real-world coding help. To fill the missing piece, in this work, we\npropose a dataset RobustAPI for evaluating the reliability and robustness of\ncode generated by LLMs. We collect 1208 coding questions from StackOverflow on\n24 representative Java APIs. We summarize the common misuse patterns of these\nAPIs and evaluate them on current popular LLMs. The evaluation results show that\neven for GPT-4, 62% of the generated code contains API misuses, which would cause\nunexpected consequences if the code is introduced into real-world software.\n","authors":["Li Zhong","Zilong Wang"],"pdf_url":"https://arxiv.org/pdf/2308.10335v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09680v2","updated":"2023-10-17T05:44:03Z","published":"2023-10-14T23:16:05Z","title":"Improved Contextual Recognition In Automatic Speech Recognition Systems\n By Semantic Lattice Rescoring","summary":" Automatic Speech Recognition (ASR) has witnessed a profound research\ninterest. 
Recent breakthroughs have given ASR systems different prospects such\nas faithfully transcribing spoken language, which is a pivotal advancement in\nbuilding conversational agents. However, there is still an imminent challenge\nof accurately discerning context-dependent words and phrases. In this work, we\npropose a novel approach for enhancing contextual recognition within ASR\nsystems via semantic lattice processing leveraging the power of deep learning\nmodels in accurately delivering spot-on transcriptions across a wide variety of\nvocabularies and speaking styles. Our solution consists of using Hidden Markov\nModels and Gaussian Mixture Models (HMM-GMM) along with Deep Neural Networks\n(DNN) models integrating both language and acoustic modeling for better\naccuracy. We infused our network with the use of a transformer-based model to\nproperly rescore the word lattice achieving remarkable capabilities with a\npalpable reduction in Word Error Rate (WER). We demonstrate the effectiveness\nof our proposed framework on the LibriSpeech dataset with empirical analyses.\n","authors":["Ankitha Sudarshan","Vinay Samuel","Parth Patwa","Ibtihel Amara","Aman Chadha"],"pdf_url":"https://arxiv.org/pdf/2310.09680v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2209.01250,\n arXiv:2301.06735 by other authors"},{"id":"http://arxiv.org/abs/2310.11003v1","updated":"2023-10-17T05:10:39Z","published":"2023-10-17T05:10:39Z","title":"Correction Focused Language Model Training for Speech Recognition","summary":" Language models (LMs) have been commonly adopted to boost the performance of\nautomatic speech recognition (ASR) particularly in domain adaptation tasks.\nConventional way of LM training treats all the words in corpora equally,\nresulting in suboptimal improvements in ASR performance. In this work, we\nintroduce a novel correction focused LM training approach which aims to\nprioritize ASR fallible words. The word-level ASR fallibility score,\nrepresenting the likelihood of ASR mis-recognition, is defined and shaped as a\nprior word distribution to guide the LM training. To enable correction focused\ntraining with text-only corpora, large language models (LLMs) are employed as\nfallibility score predictors and text generators through multi-task\nfine-tuning. Experimental results for domain adaptation tasks demonstrate the\neffectiveness of our proposed method. Compared with conventional LMs,\ncorrection focused training achieves up to relatively 5.5% word error rate\n(WER) reduction in sufficient text scenarios. In insufficient text scenarios,\nLM training with LLM-generated text achieves up to relatively 13% WER\nreduction, while correction focused training further obtains up to relatively\n6% WER reduction.\n","authors":["Yingyi Ma","Zhe Liu","Ozlem Kalinli"],"pdf_url":"https://arxiv.org/pdf/2310.11003v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.10819v2","updated":"2023-10-17T04:56:57Z","published":"2023-05-18T08:57:17Z","title":"CLEME: Debiasing Multi-reference Evaluation for Grammatical Error\n Correction","summary":" Evaluating the performance of Grammatical Error Correction (GEC) systems is a\nchallenging task due to its subjectivity. Designing an evaluation metric that\nis as objective as possible is crucial to the development of GEC task. However,\nmainstream evaluation metrics, i.e., reference-based metrics, introduce bias\ninto the multi-reference evaluation by extracting edits without considering the\npresence of multiple references. 
To overcome this issue, we propose Chunk-LEvel\nMulti-reference Evaluation (CLEME), designed to evaluate GEC systems in the\nmulti-reference evaluation setting. CLEME builds chunk sequences with\nconsistent boundaries for the source, the hypothesis and references, thus\neliminating the bias caused by inconsistent edit boundaries. Furthermore, we\nobserve the consistent boundary could also act as the boundary of grammatical\nerrors, based on which the F$_{0.5}$ score is then computed following the\ncorrection independence assumption. We conduct experiments on six English\nreference sets based on the CoNLL-2014 shared task. Extensive experiments and\ndetailed analyses demonstrate the correctness of our discovery and the\neffectiveness of CLEME. Further analysis reveals that CLEME is robust to\nevaluate GEC systems across reference sets with varying numbers of references\nand annotation style.\n","authors":["Jingheng Ye","Yinghui Li","Qingyu Zhou","Yangning Li","Shirong Ma","Hai-Tao Zheng","Ying Shen"],"pdf_url":"https://arxiv.org/pdf/2305.10819v2.pdf","comment":"Accepted as an EMNLP 2023 main paper"},{"id":"http://arxiv.org/abs/2310.10981v1","updated":"2023-10-17T04:03:00Z","published":"2023-10-17T04:03:00Z","title":"Instructive Dialogue Summarization with Query Aggregations","summary":" Conventional dialogue summarization methods directly generate summaries and\ndo not consider user's specific interests. This poses challenges in cases where\nthe users are more focused on particular topics or aspects. With the\nadvancement of instruction-finetuned language models, we introduce\ninstruction-tuning to dialogues to expand the capability set of dialogue\nsummarization models. To overcome the scarcity of instructive dialogue\nsummarization data, we propose a three-step approach to synthesize high-quality\nquery-based summarization triples. This process involves summary-anchored query\ngeneration, query filtering, and query-based summary generation. By training a\nunified model called InstructDS (Instructive Dialogue Summarization) on three\nsummarization datasets with multi-purpose instructive triples, we expand the\ncapability of dialogue summarization models. We evaluate our method on four\ndatasets, including dialogue summarization and dialogue reading comprehension.\nExperimental results show that our approach outperforms the state-of-the-art\nmodels and even models with larger sizes. Additionally, our model exhibits\nhigher generalizability and faithfulness, as confirmed by human subjective\nevaluations.\n","authors":["Bin Wang","Zhengyuan Liu","Nancy F. Chen"],"pdf_url":"https://arxiv.org/pdf/2310.10981v1.pdf","comment":"Accept to EMNLP 2023 Main Conference"},{"id":"http://arxiv.org/abs/2310.09909v2","updated":"2023-10-17T03:41:09Z","published":"2023-10-15T18:32:27Z","title":"Can GPT-4V(ision) Serve Medical Applications? Case Studies on GPT-4V for\n Multimodal Medical Diagnosis","summary":" Driven by the large foundation models, the development of artificial\nintelligence has witnessed tremendous progress lately, leading to a surge of\ngeneral interest from the public. In this study, we aim to assess the\nperformance of OpenAI's newest model, GPT-4V(ision), specifically in the realm\nof multimodal medical diagnosis. 
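The CLEME abstract above scores systems with F_{0.5} computed over chunk-level edits. For reference, a generic F_beta computation from edit-level true-positive, false-positive, and false-negative counts is sketched below; the chunking step itself is not shown, and the example counts are made up:

```python
def f_beta(tp, fp, fn, beta=0.5):
    """Generic F_beta from edit counts; beta < 1 weights precision over recall."""
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    if precision == 0.0 and recall == 0.0:
        return 0.0
    b2 = beta ** 2
    return (1 + b2) * precision * recall / (b2 * precision + recall)

# Example: 30 correct edits, 10 spurious edits, 20 missed edits.
print(f_beta(tp=30, fp=10, fn=20))  # precision 0.75, recall 0.60 -> F0.5 ~= 0.714
```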
Our evaluation encompasses 17 human body\nsystems, including Central Nervous System, Head and Neck, Cardiac, Chest,\nHematology, Hepatobiliary, Gastrointestinal, Urogenital, Gynecology,\nObstetrics, Breast, Musculoskeletal, Spine, Vascular, Oncology, Trauma,\nPediatrics, with images taken from 8 modalities used in daily clinic routine,\ne.g., X-ray, Computed Tomography (CT), Magnetic Resonance Imaging (MRI),\nPositron Emission Tomography (PET), Digital Subtraction Angiography (DSA),\nMammography, Ultrasound, and Pathology. We probe GPT-4V's ability on\nmultiple clinical tasks with or without patient history provided, including\nimaging modality and anatomy recognition, disease diagnosis, report generation,\nand disease localisation.\n Our observation shows that, while GPT-4V demonstrates proficiency in\ndistinguishing between medical image modalities and anatomy, it faces\nsignificant challenges in disease diagnosis and generating comprehensive\nreports. These findings underscore that while large multimodal models have made\nsignificant advancements in computer vision and natural language processing, they\nremain far from ready to effectively support real-world medical\napplications and clinical decision-making.\n All images used in this report can be found at\nhttps://github.com/chaoyi-wu/GPT-4V_Medical_Evaluation.\n","authors":["Chaoyi Wu","Jiayu Lei","Qiaoyu Zheng","Weike Zhao","Weixiong Lin","Xiaoman Zhang","Xiao Zhou","Ziheng Zhao","Ya Zhang","Yanfeng Wang","Weidi Xie"],"pdf_url":"https://arxiv.org/pdf/2310.09909v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.10070v2","updated":"2023-10-17T03:29:04Z","published":"2023-06-15T20:19:08Z","title":"Opportunities and Challenges for ChatGPT and Large Language Models in\n Biomedicine and Health","summary":" ChatGPT has drawn considerable attention from both the general public and\ndomain experts with its remarkable text generation capabilities. This has\nsubsequently led to the emergence of diverse applications in the field of\nbiomedicine and health. In this work, we examine the diverse applications of\nlarge language models (LLMs), such as ChatGPT, in biomedicine and health.\nSpecifically, we explore the areas of biomedical information retrieval, question\nanswering, medical text summarization, information extraction, and medical\neducation, and investigate whether LLMs possess the transformative power to\nrevolutionize these tasks or whether the distinct complexities of the biomedical\ndomain present unique challenges. Following an extensive literature survey, we\nfind that significant advances have been made in the field of text generation\ntasks, surpassing the previous state-of-the-art methods. For other\napplications, the advances have been modest. Overall, LLMs have not yet\nrevolutionized biomedicine, but recent rapid progress indicates that such\nmethods hold great potential to provide valuable means for accelerating\ndiscovery and improving health. We also find that the use of LLMs, like\nChatGPT, in the fields of biomedicine and health entails various risks and\nchallenges, including fabricated information in its generated responses, as\nwell as legal and privacy concerns associated with sensitive patient data. 
We\nbelieve this survey can provide a comprehensive and timely overview to\nbiomedical researchers and healthcare practitioners on the opportunities and\nchallenges associated with using ChatGPT and other LLMs for transforming\nbiomedicine and health.\n","authors":["Shubo Tian","Qiao Jin","Lana Yeganova","Po-Ting Lai","Qingqing Zhu","Xiuying Chen","Yifan Yang","Qingyu Chen","Won Kim","Donald C. Comeau","Rezarta Islamaj","Aadit Kapoor","Xin Gao","Zhiyong Lu"],"pdf_url":"https://arxiv.org/pdf/2306.10070v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.10967v1","updated":"2023-10-17T03:28:29Z","published":"2023-10-17T03:28:29Z","title":"EXMODD: An EXplanatory Multimodal Open-Domain Dialogue dataset","summary":" The need for high-quality data has been a key issue hindering the research of\ndialogue tasks. Recent studies try to build datasets through manual, web\ncrawling, and large pre-trained models. However, man-made data is expensive and\ndata collected from the internet often includes generic responses, meaningless\nstatements, and toxic dialogues. Automatic data generation through large models\nis a cost-effective method, but for open-domain multimodal dialogue tasks,\nthere are still three drawbacks: 1) There is currently no open-source large\nmodel that can accept multimodal input; 2) The content generated by the model\nlacks interpretability; 3) The generated data is usually difficult to quality\ncontrol and require extensive resource to collect. To alleviate the significant\nhuman and resource expenditure in data collection, we propose a Multimodal Data\nConstruction Framework (MDCF). MDCF designs proper prompts to spur the\nlarge-scale pre-trained language model to generate well-formed and satisfactory\ncontent. Additionally, MDCF also automatically provides explanation for a given\nimage and its corresponding dialogue, which can provide a certain degree of\ninterpretability and facilitate manual follow-up quality inspection. Based on\nthis, we release an Explanatory Multimodal Open-Domain dialogue dataset\n(EXMODD). Experiments indicate a positive correlation between the model's\nability to generate accurate understandings and high-quality responses. Our\ncode and data can be found at https://github.com/poplpr/EXMODD.\n","authors":["Hang Yin","Pinren Lu","Ziang Li","Bin Sun","Kan Li"],"pdf_url":"https://arxiv.org/pdf/2310.10967v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.10962v1","updated":"2023-10-17T03:21:43Z","published":"2023-10-17T03:21:43Z","title":"Semantic-Aware Contrastive Sentence Representation Learning with Large\n Language Models","summary":" Contrastive learning has been proven to be effective in learning better\nsentence representations. However, to train a contrastive learning model, large\nnumbers of labeled sentences are required to construct positive and negative\npairs explicitly, such as those in natural language inference (NLI) datasets.\nUnfortunately, acquiring sufficient high-quality labeled data can be both\ntime-consuming and resource-intensive, leading researchers to focus on\ndeveloping methods for learning unsupervised sentence representations. As there\nis no clear relationship between these unstructured randomly-sampled sentences,\nbuilding positive and negative pairs over them is tricky and problematic. To\ntackle these challenges, in this paper, we propose SemCSR, a semantic-aware\ncontrastive sentence representation framework. 
By leveraging the generation and\nevaluation capabilities of large language models (LLMs), we can automatically\nconstruct a high-quality NLI-style corpus without any human annotation, and\nfurther incorporate the generated sentence pairs into learning a contrastive\nsentence representation model. Extensive experiments and comprehensive analyses\ndemonstrate the effectiveness of our proposed framework for learning a better\nsentence representation with LLMs.\n","authors":["Huiming Wang","Liying Cheng","Zhaodonghui Li","De Wen Soh","Lidong Bing"],"pdf_url":"https://arxiv.org/pdf/2310.10962v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.10956v1","updated":"2023-10-17T03:05:42Z","published":"2023-10-17T03:05:42Z","title":"Computing the optimal keyboard through a geometric analysis of the\n English language","summary":" In the context of a group project for the course COMSW4995 002 - Geometric\nData Analysis, we bring our attention to the design of fast-typing keyboards.\nLeveraging some geometric tools in an optimization framework allowed us to\npropose novel keyboard layouts that offer faster typing.\n","authors":["Jules Deschamps","Quentin Hubert","Lucas Ryckelynck"],"pdf_url":"https://arxiv.org/pdf/2310.10956v1.pdf","comment":"17 pages"},{"id":"http://arxiv.org/abs/2310.10955v1","updated":"2023-10-17T03:05:06Z","published":"2023-10-17T03:05:06Z","title":"A State-Vector Framework for Dataset Effects","summary":" The impressive success of recent deep neural network (DNN)-based systems is\nsignificantly influenced by the high-quality datasets used in training.\nHowever, the effects of the datasets, especially how they interact with each\nother, remain underexplored. We propose a state-vector framework to enable\nrigorous studies in this direction. This framework uses idealized probing test\nresults as the bases of a vector space. This framework allows us to quantify\nthe effects of both standalone and interacting datasets. We show that the\nsignificant effects of some commonly-used language understanding datasets are\ncharacteristic and are concentrated on a few linguistic dimensions.\nAdditionally, we observe some ``spill-over'' effects: the datasets could impact\nthe models along dimensions that may seem unrelated to the intended tasks. Our\nstate-vector framework paves the way for a systematic understanding of the\ndataset effects, a crucial component in responsible and robust model\ndevelopment.\n","authors":["Esmat Sahak","Zining Zhu","Frank Rudzicz"],"pdf_url":"https://arxiv.org/pdf/2310.10955v1.pdf","comment":"EMNLP 2023"},{"id":"http://arxiv.org/abs/2310.10944v1","updated":"2023-10-17T02:42:34Z","published":"2023-10-17T02:42:34Z","title":"TEQ: Trainable Equivalent Transformation for Quantization of LLMs","summary":" As large language models (LLMs) become more prevalent, there is a growing\nneed for new and improved quantization methods that can meet the\ncomputational demands of these modern architectures while maintaining\naccuracy. In this paper, we present TEQ, a trainable equivalent\ntransformation that preserves the FP32 precision of the model output while\ntaking advantage of low-precision quantization, especially 3- and 4-bit\nweight-only quantization. The training process is lightweight, requiring only\n1K steps and fewer than 0.1 percent of the original model's trainable\nparameters. Furthermore, the transformation does not add any computational\noverhead during inference. 
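The TEQ abstract above describes a trainable transformation that leaves the full-precision output of a linear layer unchanged while making its weights friendlier to low-bit quantization. One common form of such an equivalence is a per-input-channel scale applied to the weights with its inverse folded into the activations; the sketch below only illustrates that identity with fixed scales and a toy round-to-nearest quantizer, whereas TEQ learns the scales, so this is not the paper's implementation:

```python
import numpy as np

def fake_quantize(w, bits=4):
    """Toy symmetric round-to-nearest weight quantizer (per-tensor)."""
    qmax = 2 ** (bits - 1) - 1
    scale = np.abs(w).max() / qmax
    return np.round(w / scale) * scale

rng = np.random.default_rng(0)
x = rng.normal(size=(8, 16))         # activations (batch, in_features)
W = rng.normal(size=(16, 32))        # weights of a linear layer

s = np.abs(W).max(axis=1)            # one positive scale per input channel
W_scaled = W / s[:, None]            # scale the weights ...
x_scaled = x * s[None, :]            # ... and fold the inverse into the input

# The transformation is an exact equivalence in full precision.
assert np.allclose(x_scaled @ W_scaled, x @ W)

# Quantization is then applied to the rescaled weights.
y_quantized = x_scaled @ fake_quantize(W_scaled)
```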
Our results are on-par with the state-of-the-art\n(SOTA) methods on typical LLMs. Our approach can be combined with other methods\nto achieve even better performance. The code is available at\nhttps://github.com/intel/neural-compressor.\n","authors":["Wenhua Cheng","Yiyang Cai","Kaokao Lv","Haihao Shen"],"pdf_url":"https://arxiv.org/pdf/2310.10944v1.pdf","comment":"10 pages, 3 figures"},{"id":"http://arxiv.org/abs/2304.01492v5","updated":"2023-10-17T02:37:46Z","published":"2023-04-04T03:13:03Z","title":"A Unified Contrastive Transfer Framework with Propagation Structure for\n Boosting Low-Resource Rumor Detection","summary":" The truth is significantly hampered by massive rumors that spread along with\nbreaking news or popular topics. Since there is sufficient corpus gathered from\nthe same domain for model training, existing rumor detection algorithms show\npromising performance on yesterday's news. However, due to a lack of\nsubstantial training data and prior expert knowledge, they are poor at spotting\nrumors concerning unforeseen events, especially those propagated in different\nlanguages (i.e., low-resource regimes). In this paper, we propose a unified\ncontrastive transfer framework to detect rumors by adapting the features\nlearned from well-resourced rumor data to that of the low-resourced with only\nfew-shot annotations. More specifically, we first represent rumor circulated on\nsocial media as an undirected topology for enhancing the interaction of user\nopinions, and then train a Multi-scale Graph Convolutional Network via a\nunified contrastive paradigm to mine effective clues simultaneously from post\nsemantics and propagation structure. Our model explicitly breaks the barriers\nof the domain and/or language issues, via language alignment and a novel\ndomain-adaptive contrastive learning mechanism. To well-generalize the\nrepresentation learning using a small set of annotated target events, we reveal\nthat rumor-indicative signal is closely correlated with the uniformity of the\ndistribution of these events. We design a target-wise contrastive training\nmechanism with three event-level data augmentation strategies, capable of\nunifying the representations by distinguishing target events. Extensive\nexperiments conducted on four low-resource datasets collected from real-world\nmicroblog platforms demonstrate that our framework achieves much better\nperformance than state-of-the-art methods and exhibits a superior capacity for\ndetecting rumors at early stages.\n","authors":["Hongzhan Lin","Jing Ma","Ruichao Yang","Zhiwei Yang","Mingfei Cheng"],"pdf_url":"https://arxiv.org/pdf/2304.01492v5.pdf","comment":"An extension of the first contrastive approach for low-resource rumor\n detection (arXiv:2204.08143)"},{"id":"http://arxiv.org/abs/2310.10941v1","updated":"2023-10-17T02:34:34Z","published":"2023-10-17T02:34:34Z","title":"MASON-NLP at eRisk 2023: Deep Learning-Based Detection of Depression\n Symptoms from Social Media Texts","summary":" Depression is a mental health disorder that has a profound impact on people's\nlives. Recent research suggests that signs of depression can be detected in the\nway individuals communicate, both through spoken words and written texts. In\nparticular, social media posts are a rich and convenient text source that we\nmay examine for depressive symptoms. The Beck Depression Inventory (BDI)\nQuestionnaire, which is frequently used to gauge the severity of depression, is\none instrument that can aid in this study. 
We can narrow our study to only\nthose symptoms since each BDI question is linked to a particular depressive\nsymptom. It's important to remember that not everyone with depression exhibits\nall symptoms at once, but rather a combination of them. Therefore, it is\nextremely useful to be able to determine if a sentence or a piece of\nuser-generated content is pertinent to a certain condition. With this in mind,\nthe eRisk 2023 Task 1 was designed to do exactly that: assess the relevance of\ndifferent sentences to the symptoms of depression as outlined in the BDI\nquestionnaire. This report is all about how our team, Mason-NLP, participated\nin this subtask, which involved identifying sentences related to different\ndepression symptoms. We used a deep learning approach that incorporated\nMentalBERT, RoBERTa, and LSTM. Despite our efforts, the evaluation results were\nlower than expected, underscoring the challenges inherent in ranking sentences\nfrom an extensive dataset about depression, which necessitates both appropriate\nmethodological choices and significant computational resources. We anticipate\nthat future iterations of this shared task will yield improved results as our\nunderstanding and techniques evolve.\n","authors":["Fardin Ahsan Sakib","Ahnaf Atef Choudhury","Ozlem Uzuner"],"pdf_url":"https://arxiv.org/pdf/2310.10941v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.10935v1","updated":"2023-10-17T02:12:12Z","published":"2023-10-17T02:12:12Z","title":"Intent Detection and Slot Filling for Home Assistants: Dataset and\n Analysis for Bangla and Sylheti","summary":" As voice assistants cement their place in our technologically advanced\nsociety, there remains a need to cater to the diverse linguistic landscape,\nincluding colloquial forms of low-resource languages. Our study introduces the\nfirst-ever comprehensive dataset for intent detection and slot filling in\nformal Bangla, colloquial Bangla, and Sylheti languages, totaling 984 samples\nacross 10 unique intents. Our analysis reveals the robustness of large language\nmodels for tackling downstream tasks with inadequate data. The GPT-3.5 model\nachieves an impressive F1 score of 0.94 in intent detection and 0.51 in slot\nfilling for colloquial Bangla.\n","authors":["Fardin Ahsan Sakib","A H M Rezaul Karim","Saadat Hasan Khan","Md Mushfiqur Rahman"],"pdf_url":"https://arxiv.org/pdf/2310.10935v1.pdf","comment":"Accepted at the First Workshop on Bangla Language Processing, 2023"},{"id":"http://arxiv.org/abs/2310.09430v2","updated":"2023-10-17T02:08:24Z","published":"2023-10-13T22:29:15Z","title":"A Systematic Evaluation of Large Language Models on Out-of-Distribution\n Logical Reasoning Tasks","summary":" Large language models (LLMs), such as GPT-3.5 and GPT-4, have greatly\nadvanced the performance of artificial systems on various natural language\nprocessing tasks to human-like levels. However, their generalisation and\nrobustness to perform logical reasoning remain under-evaluated. To probe this\nability, we propose three new logical reasoning datasets named \"ReClor-plus\",\n\"LogiQA-plus\" and \"LogiQAv2-plus\", each featuring three subsets: the first with\nrandomly shuffled options, the second with the correct choices replaced by\n\"none of the other options are correct\", and a combination of the previous two\nsubsets. We carry out experiments on these datasets with both discriminative\nand generative LLMs and show that these simple tricks greatly hinder the\nperformance of the language models. 
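The dataset perturbations described in the logical-reasoning abstract above (randomly shuffling the answer options, and replacing the gold option with a "none of the other options are correct" choice) are straightforward to reproduce. A sketch under the assumption that each item is a dict with an 'options' list and an integer 'answer' index; the field names are illustrative, not the datasets' actual schema:

```python
import random

NONE_OPTION = "none of the other options are correct"

def shuffle_options(item, seed=0):
    """Return a copy of the item with its answer options randomly reordered."""
    rng = random.Random(seed)
    order = list(range(len(item["options"])))
    rng.shuffle(order)
    return {"options": [item["options"][i] for i in order],
            "answer": order.index(item["answer"])}

def replace_gold_with_none(item):
    """Replace the text of the correct option with the 'none of the above'-style choice."""
    options = list(item["options"])
    options[item["answer"]] = NONE_OPTION
    return {"options": options, "answer": item["answer"]}

item = {"options": ["A", "B", "C", "D"], "answer": 2}
print(shuffle_options(item))
print(replace_gold_with_none(item))
```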
Despite their superior performance on the\noriginal publicly available datasets, we find that all models struggle to\nanswer our newly constructed datasets. We show that introducing task variations\nby perturbing a sizable training set can markedly improve the model's\ngeneralisation and robustness in logical reasoning tasks. Moreover, applying\nlogic-driven data augmentation for fine-tuning, combined with prompting can\nenhance the generalisation performance of both discriminative large language\nmodels and generative large language models. These results offer insights into\nassessing and improving the generalisation and robustness of large language\nmodels for logical reasoning tasks. We make our source code and data publicly\navailable\n\\url{https://github.com/Strong-AI-Lab/Logical-and-abstract-reasoning}.\n","authors":["Qiming Bao","Gael Gendron","Alex Yuxuan Peng","Wanjun Zhong","Neset Tan","Yang Chen","Michael Witbrock","Jiamou Liu"],"pdf_url":"https://arxiv.org/pdf/2310.09430v2.pdf","comment":"Accepted for oral presentation at the LLM@IJCAI 2023 non-archival\n symposium"},{"id":"http://arxiv.org/abs/2310.10930v1","updated":"2023-10-17T01:59:07Z","published":"2023-10-17T01:59:07Z","title":"Enhanced Transformer Architecture for Natural Language Processing","summary":" Transformer is a state-of-the-art model in the field of natural language\nprocessing (NLP). Current NLP models primarily increase the number of\ntransformers to improve processing performance. However, this technique\nrequires a lot of training resources such as computing capacity. In this paper,\na novel structure of Transformer is proposed. It is featured by full layer\nnormalization, weighted residual connection, positional encoding exploiting\nreinforcement learning, and zero masked self-attention. The proposed\nTransformer model, which is called Enhanced Transformer, is validated by the\nbilingual evaluation understudy (BLEU) score obtained with the Multi30k\ntranslation dataset. As a result, the Enhanced Transformer achieves 202.96%\nhigher BLEU score as compared to the original transformer with the translation\ndataset.\n","authors":["Woohyeon Moon","Taeyoung Kim","Bumgeun Park","Dongsoo Har"],"pdf_url":"https://arxiv.org/pdf/2310.10930v1.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2310.08659v2","updated":"2023-10-17T01:35:10Z","published":"2023-10-12T18:34:08Z","title":"LoftQ: LoRA-Fine-Tuning-Aware Quantization for Large Language Models","summary":" Quantization is an indispensable technique for serving Large Language Models\n(LLMs) and has recently found its way into LoRA fine-tuning. In this work we\nfocus on the scenario where quantization and LoRA fine-tuning are applied\ntogether on a pre-trained model. In such cases it is common to observe a\nconsistent gap in the performance on downstream tasks between full fine-tuning\nand quantization plus LoRA fine-tuning approach. In response, we propose LoftQ\n(LoRA-Fine-Tuning-aware Quantization), a novel quantization framework that\nsimultaneously quantizes an LLM and finds a proper low-rank initialization for\nLoRA fine-tuning. Such an initialization alleviates the discrepancy between the\nquantized and full-precision model and significantly improves the\ngeneralization in downstream tasks. We evaluate our method on natural language\nunderstanding, question answering, summarization, and natural language\ngeneration tasks. 
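One common way to realize the kind of quantization-aware low-rank initialization described in the LoftQ abstract above is to alternate between quantizing the weight matrix and fitting a rank-r factorization to the quantization residual via SVD. The sketch below follows that generic recipe with a toy round-to-nearest quantizer; it is a plausible reading of the idea, not necessarily the authors' exact algorithm:

```python
import numpy as np

def fake_quantize(w, bits=2):
    """Toy symmetric round-to-nearest quantizer standing in for real low-bit schemes."""
    qmax = 2 ** (bits - 1) - 1
    scale = np.abs(w).max() / qmax
    return np.round(w / scale) * scale

def quantization_aware_lowrank_init(W, rank=16, bits=2, iters=5):
    """Alternate quantizing W - A @ B.T and refitting (A, B) to the residual W - Q."""
    A = np.zeros((W.shape[0], rank))
    B = np.zeros((W.shape[1], rank))
    for _ in range(iters):
        Q = fake_quantize(W - A @ B.T, bits=bits)     # quantize what low-rank misses
        U, S, Vt = np.linalg.svd(W - Q, full_matrices=False)
        A = U[:, :rank] * S[:rank]                    # best rank-r fit to the residual
        B = Vt[:rank].T
    return Q, A, B

rng = np.random.default_rng(0)
W = rng.normal(size=(64, 64))
Q, A, B = quantization_aware_lowrank_init(W)
print("relative init error:", np.linalg.norm(W - (Q + A @ B.T)) / np.linalg.norm(W))
```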
Experiments show that our method is highly effective and\noutperforms existing quantization methods, especially in the challenging 2-bit\nand 2/4-bit mixed precision regimes. We will release our code.\n","authors":["Yixiao Li","Yifan Yu","Chen Liang","Pengcheng He","Nikos Karampatziakis","Weizhu Chen","Tuo Zhao"],"pdf_url":"https://arxiv.org/pdf/2310.08659v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.10922v1","updated":"2023-10-17T01:31:59Z","published":"2023-10-17T01:31:59Z","title":"Spatial HuBERT: Self-supervised Spatial Speech Representation Learning\n for a Single Talker from Multi-channel Audio","summary":" Self-supervised learning has been used to leverage unlabelled data, improving\naccuracy and generalisation of speech systems through the training of\nrepresentation models. While many recent works have sought to produce effective\nrepresentations across a variety of acoustic domains, languages, modalities and\neven simultaneous speakers, these studies have all been limited to\nsingle-channel audio recordings. This paper presents Spatial HuBERT, a\nself-supervised speech representation model that learns both acoustic and\nspatial information pertaining to a single speaker in a potentially noisy\nenvironment by using multi-channel audio inputs. Spatial HuBERT learns\nrepresentations that outperform state-of-the-art single-channel speech\nrepresentations on a variety of spatial downstream tasks, particularly in\nreverberant and noisy environments. We also demonstrate the utility of the\nrepresentations learned by Spatial HuBERT on a speech localisation downstream\ntask. Along with this paper, we publicly release a new dataset of 100 000\nsimulated first-order ambisonics room impulse responses.\n","authors":["Antoni Dimitriadis","Siqi Pan","Vidhyasaharan Sethu","Beena Ahmed"],"pdf_url":"https://arxiv.org/pdf/2310.10922v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04965v2","updated":"2023-10-17T01:30:57Z","published":"2023-09-10T08:55:24Z","title":"Prefix-diffusion: A Lightweight Diffusion Model for Diverse Image\n Captioning","summary":" While impressive performance has been achieved in image captioning, the\nlimited diversity of the generated captions and the large parameter scale\nremain major barriers to the real-word application of these systems. In this\nwork, we propose a lightweight image captioning network in combination with\ncontinuous diffusion, called Prefix-diffusion. To achieve diversity, we design\nan efficient method that injects prefix image embeddings into the denoising\nprocess of the diffusion model. In order to reduce trainable parameters, we\nemploy a pre-trained model to extract image features and further design an\nextra mapping network. Prefix-diffusion is able to generate diverse captions\nwith relatively less parameters, while maintaining the fluency and relevance of\nthe captions benefiting from the generative capabilities of the diffusion\nmodel. 
Our work paves the way for scaling up diffusion models for image\ncaptioning, and achieves promising performance compared with recent approaches.\n","authors":["Guisheng Liu","Yi Li","Zhengcong Fei","Haiyan Fu","Xiangyang Luo","Yanqing Guo"],"pdf_url":"https://arxiv.org/pdf/2309.04965v2.pdf","comment":"11 pages,4 figures, 6 tables"},{"id":"http://arxiv.org/abs/2310.10920v1","updated":"2023-10-17T01:27:20Z","published":"2023-10-17T01:27:20Z","title":"NuclearQA: A Human-Made Benchmark for Language Models for the Nuclear\n Domain","summary":" As LLMs have become increasingly popular, they have been used in almost every\nfield. But as the application for LLMs expands from generic fields to narrow,\nfocused science domains, there exists an ever-increasing gap in ways to\nevaluate their efficacy in those fields. For the benchmarks that do exist, a\nlot of them focus on questions that don't require proper understanding of the\nsubject in question. In this paper, we present NuclearQA, a human-made\nbenchmark of 100 questions to evaluate language models in the nuclear domain,\nconsisting of a varying collection of questions that have been specifically\ndesigned by experts to test the abilities of language models. We detail our\napproach and show how the mix of several types of questions makes our benchmark\nuniquely capable of evaluating models in the nuclear domain. We also present\nour own evaluation metric for assessing LLM's performances due to the\nlimitations of existing ones. Our experiments on state-of-the-art models\nsuggest that even the best LLMs perform less than satisfactorily on our\nbenchmark, demonstrating the scientific knowledge gap of existing LLMs.\n","authors":["Anurag Acharya","Sai Munikoti","Aaron Hellinger","Sara Smith","Sridevi Wagle","Sameera Horawalavithana"],"pdf_url":"https://arxiv.org/pdf/2310.10920v1.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2310.10903v1","updated":"2023-10-17T00:22:10Z","published":"2023-10-17T00:22:10Z","title":"Emergent AI-Assisted Discourse: Case Study of a Second Language Writer\n Authoring with ChatGPT","summary":" The rapid proliferation of ChatGPT has incited debates regarding its impact\non human writing. Amid concerns about declining writing standards, this study\ninvestigates the role of ChatGPT in facilitating academic writing, especially\namong language learners. Using a case study approach, this study examines the\nexperiences of Kailing, a doctoral student, who integrates ChatGPT throughout\ntheir academic writing process. The study employs activity theory as a lens for\nunderstanding writing with generative AI tools and data analyzed includes\nsemi-structured interviews, writing samples, and GPT logs. Results indicate\nthat Kailing effectively collaborates with ChatGPT across various writing\nstages while preserving her distinct authorial voice and agency. 
This\nunderscores the potential of AI tools such as ChatGPT to enhance academic\nwriting for language learners without overshadowing individual authenticity.\nThis case study offers a critical exploration of how ChatGPT is utilized in the\nacademic writing process and the preservation of a student's authentic voice\nwhen engaging with the tool.\n","authors":["Sharin Jacob","Tamara Tate","Mark Warschauer"],"pdf_url":"https://arxiv.org/pdf/2310.10903v1.pdf","comment":"24 pages"},{"id":"http://arxiv.org/abs/2310.11628v1","updated":"2023-10-17T23:34:39Z","published":"2023-10-17T23:34:39Z","title":"Learn Your Tokens: Word-Pooled Tokenization for Language Modeling","summary":" Language models typically tokenize text into subwords, using a deterministic,\nhand-engineered heuristic of combining characters into longer surface-level\nstrings such as 'ing' or whole words. Recent literature has repeatedly shown\nthe limitations of such a tokenization strategy, particularly for documents not\nwritten in English and for representing numbers. On the other extreme,\nbyte/character-level language models are much less restricted but suffer from\nincreased sequence description lengths and a subsequent quadratic expansion in\nself-attention computation. Recent attempts to compress and limit these context\nlengths with fixed-size convolutions are helpful but completely ignore the word\nboundary. This paper considers an alternative 'learn your tokens' scheme which\nutilizes the word boundary to pool bytes/characters into word representations,\nwhich are fed to the primary language model, before again decoding individual\ncharacters/bytes per word in parallel. We find that our moderately expressive\nand moderately fast end-to-end tokenizer outperforms by over 300% both subwords\nand byte/character models over the intrinsic language modeling metric of\nnext-word prediction across datasets. It particularly shines on rare words,\noutperforming by a factor of 30! We extensively study the language modeling\nsetup for all three categories of tokenizers and theoretically analyze how our\nend-to-end models can also be a strong trade-off in efficiency and robustness.\n","authors":["Avijit Thawani","Saurabh Ghanekar","Xiaoyuan Zhu","Jay Pujara"],"pdf_url":"https://arxiv.org/pdf/2310.11628v1.pdf","comment":"Accepted to EMNLP 2023 Findings"},{"id":"http://arxiv.org/abs/2310.05295v2","updated":"2023-10-17T22:43:08Z","published":"2023-10-08T21:45:34Z","title":"Visual Storytelling with Question-Answer Plans","summary":" Visual storytelling aims to generate compelling narratives from image\nsequences. Existing models often focus on enhancing the representation of the\nimage sequence, e.g., with external knowledge sources or advanced graph\nstructures. Despite recent progress, the stories are often repetitive,\nillogical, and lacking in detail. To mitigate these issues, we present a novel\nframework which integrates visual representations with pretrained language\nmodels and planning. Our model translates the image sequence into a visual\nprefix, a sequence of continuous embeddings which language models can\ninterpret. It also leverages a sequence of question-answer pairs as a blueprint\nplan for selecting salient visual concepts and determining how they should be\nassembled into a narrative. 
Automatic and human evaluation on the VIST\nbenchmark (Huang et al., 2016) demonstrates that blueprint-based models\ngenerate stories that are more coherent, interesting, and natural compared to\ncompetitive baselines and state-of-the-art systems.\n","authors":["Danyang Liu","Mirella Lapata","Frank Keller"],"pdf_url":"https://arxiv.org/pdf/2310.05295v2.pdf","comment":"EMNLP 2023 Findings"},{"id":"http://arxiv.org/abs/2310.11616v1","updated":"2023-10-17T22:42:12Z","published":"2023-10-17T22:42:12Z","title":"Unveiling the General Intelligence Factor in Language Models: A\n Psychometric Approach","summary":" This study uncovers the factor of general intelligence, or g, in language\nmodels, extending the psychometric theory traditionally applied to humans and\ncertain animal species. Utilizing factor analysis on two extensive datasets -\nOpen LLM Leaderboard with 1,232 models and General Language Understanding\nEvaluation (GLUE) Leaderboard with 88 models - we find compelling evidence for\na unidimensional, highly stable g factor that accounts for 85% of the variance\nin model performance. The study also finds a moderate correlation of .48\nbetween model size and g. The discovery of g in language models offers a\nunified metric for model evaluation and opens new avenues for more robust,\ng-based model ability assessment. These findings lay the foundation for\nunderstanding and future research on artificial general intelligence from a\npsychometric perspective and have practical implications for model evaluation\nand development.\n","authors":["David Ilić"],"pdf_url":"https://arxiv.org/pdf/2310.11616v1.pdf","comment":"10 pages (including appendix), 7 figures"},{"id":"http://arxiv.org/abs/2310.09166v2","updated":"2023-10-17T22:37:58Z","published":"2023-10-13T15:01:17Z","title":"Developing a Natural Language Understanding Model to Characterize Cable\n News Bias","summary":" Media bias has been extensively studied by both social and computational\nsciences. However, current work still has a large reliance on human input and\nsubjective assessment to label biases. This is especially true for cable news\nresearch. To address these issues, we develop an unsupervised machine learning\nmethod to characterize the bias of cable news programs without any human input.\nThis method relies on the analysis of what topics are mentioned through Named\nEntity Recognition and how those topics are discussed through Stance Analysis\nin order to cluster programs with similar biases together. Applying our method\nto 2020 cable news transcripts, we find that program clusters are consistent\nover time and roughly correspond to the cable news network of the program. This\nmethod reveals the potential for future tools to objectively assess media bias\nand characterize unfamiliar media environments.\n","authors":["Seth P. Benson","Iain J. Cruickshank"],"pdf_url":"https://arxiv.org/pdf/2310.09166v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.10764v2","updated":"2023-10-17T22:15:38Z","published":"2022-12-21T04:49:55Z","title":"Learning List-Level Domain-Invariant Representations for Ranking","summary":" Domain adaptation aims to transfer the knowledge learned on (data-rich)\nsource domains to (low-resource) target domains, and a popular method is\ninvariant representation learning, which matches and aligns the data\ndistributions on the feature space. 
Although this method is studied extensively\nand applied on classification and regression problems, its adoption on ranking\nproblems is sporadic, and the few existing implementations lack theoretical\njustifications. This paper revisits invariant representation learning for\nranking. Upon reviewing prior work, we found that they implement what we call\nitem-level alignment, which aligns the distributions of the items being ranked\nfrom all lists in aggregate but ignores their list structure. However, the list\nstructure should be leveraged, because it is intrinsic to ranking problems\nwhere the data and the metrics are defined and computed on lists, not the items\nby themselves. To close this discrepancy, we propose list-level alignment --\nlearning domain-invariant representations at the higher level of lists. The\nbenefits are twofold: it leads to the first domain adaptation generalization\nbound for ranking, in turn providing theoretical support for the proposed\nmethod, and it achieves better empirical transfer performance for unsupervised\ndomain adaptation on ranking tasks, including passage reranking.\n","authors":["Ruicheng Xian","Honglei Zhuang","Zhen Qin","Hamed Zamani","Jing Lu","Ji Ma","Kai Hui","Han Zhao","Xuanhui Wang","Michael Bendersky"],"pdf_url":"https://arxiv.org/pdf/2212.10764v2.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2302.09473v2","updated":"2023-10-17T22:01:00Z","published":"2023-02-19T04:03:22Z","title":"Video-Text Retrieval by Supervised Sparse Multi-Grained Learning","summary":" While recent progress in video-text retrieval has been advanced by the\nexploration of better representation learning, in this paper, we present a\nnovel multi-grained sparse learning framework, S3MA, to learn an aligned sparse\nspace shared between the video and the text for video-text retrieval. The\nshared sparse space is initialized with a finite number of sparse concepts,\neach of which refers to a number of words. With the text data at hand, we learn\nand update the shared sparse space in a supervised manner using the proposed\nsimilarity and alignment losses. Moreover, to enable multi-grained alignment,\nwe incorporate frame representations for better modeling the video modality and\ncalculating fine-grained and coarse-grained similarities. Benefiting from the\nlearned shared sparse space and multi-grained similarities, extensive\nexperiments on several video-text retrieval benchmarks demonstrate the\nsuperiority of S3MA over existing methods. Our code is available at\nhttps://github.com/yimuwangcs/Better_Cross_Modal_Retrieval.\n","authors":["Yimu Wang","Peng Shi"],"pdf_url":"https://arxiv.org/pdf/2302.09473v2.pdf","comment":"Findings of EMNLP 2023"},{"id":"http://arxiv.org/abs/2310.11604v1","updated":"2023-10-17T21:57:36Z","published":"2023-10-17T21:57:36Z","title":"Language Models as Zero-Shot Trajectory Generators","summary":" Large Language Models (LLMs) have recently shown promise as high-level\nplanners for robots when given access to a selection of low-level skills.\nHowever, it is often assumed that LLMs do not possess sufficient knowledge to\nbe used for the low-level trajectories themselves. In this work, we address\nthis assumption thoroughly, and investigate if an LLM (GPT-4) can directly\npredict a dense sequence of end-effector poses for manipulation skills, when\ngiven access to only object detection and segmentation vision models. 
We study\nhow well a single task-agnostic prompt, without any in-context examples, motion\nprimitives, or external trajectory optimisers, can perform across 26 real-world\nlanguage-based tasks, such as \"open the bottle cap\" and \"wipe the plate with\nthe sponge\", and we investigate which design choices in this prompt are the\nmost effective. Our conclusions raise the assumed limit of LLMs for robotics,\nand we reveal for the first time that LLMs do indeed possess an understanding\nof low-level robot control sufficient for a range of common tasks, and that\nthey can additionally detect failures and then re-plan trajectories\naccordingly. Videos, code, and prompts are available at:\nhttps://www.robot-learning.uk/language-models-trajectory-generators.\n","authors":["Teyun Kwon","Norman Di Palo","Edward Johns"],"pdf_url":"https://arxiv.org/pdf/2310.11604v1.pdf","comment":"19 pages, 21 figures"},{"id":"http://arxiv.org/abs/2310.11593v1","updated":"2023-10-17T21:35:06Z","published":"2023-10-17T21:35:06Z","title":"Automated Evaluation of Personalized Text Generation using Large\n Language Models","summary":" Personalized text generation presents a specialized mechanism for delivering\ncontent that is specific to a user's personal context. While the research\nprogress in this area has been rapid, evaluation still presents a challenge.\nTraditional automated metrics such as BLEU and ROUGE primarily measure lexical\nsimilarity to human-written references, and are not able to distinguish\npersonalization from other subtle semantic aspects, thus falling short of\ncapturing the nuances of personalized generated content quality. On the other\nhand, human judgments are costly to obtain, especially in the realm of\npersonalized evaluation. Inspired by these challenges, we explore the use of\nlarge language models (LLMs) for evaluating personalized text generation, and\nexamine their ability to understand nuanced user context. We present AuPEL, a\nnovel evaluation method that distills three major semantic aspects of the\ngenerated text: personalization, quality and relevance, and automatically\nmeasures these aspects. To validate the effectiveness of AuPEL, we design\ncarefully controlled experiments and compare the accuracy of the evaluation\njudgments made by LLMs versus that of judgements made by human annotators, and\nconduct rigorous analyses of the consistency and sensitivity of the proposed\nmetric. We find that, compared to existing evaluation metrics, AuPEL not only\ndistinguishes and ranks models based on their personalization abilities more\naccurately, but also presents commendable consistency and efficiency for this\ntask. Our work suggests that using LLMs as the evaluators of personalized text\ngeneration is superior to traditional text similarity metrics, even though\ninteresting new challenges still remain.\n","authors":["Yaqing Wang","Jiepu Jiang","Mingyang Zhang","Cheng Li","Yi Liang","Qiaozhu Mei","Michael Bendersky"],"pdf_url":"https://arxiv.org/pdf/2310.11593v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11589v1","updated":"2023-10-17T21:11:21Z","published":"2023-10-17T21:11:21Z","title":"Eliciting Human Preferences with Language Models","summary":" Language models (LMs) can be directed to perform target tasks by using\nlabeled examples or natural language prompts. 
But selecting examples or writing\nprompts can be challenging--especially in tasks that involve unusual edge\ncases, demand precise articulation of nebulous preferences, or require an\naccurate mental model of LM behavior. We propose to use *LMs themselves* to\nguide the task specification process. In this paper, we introduce **Generative\nActive Task Elicitation (GATE)**: a learning framework in which models elicit\nand infer intended behavior through free-form, language-based interaction with\nusers. We study GATE in three domains: email validation, content\nrecommendation, and moral reasoning. In preregistered experiments, we show that\nLMs prompted to perform GATE (e.g., by generating open-ended questions or\nsynthesizing informative edge cases) elicit responses that are often more\ninformative than user-written prompts or labels. Users report that interactive\ntask elicitation requires less effort than prompting or example labeling and\nsurfaces novel considerations not initially anticipated by users. Our findings\nsuggest that LM-driven elicitation can be a powerful tool for aligning models\nto complex human preferences and values.\n","authors":["Belinda Z. Li","Alex Tamkin","Noah Goodman","Jacob Andreas"],"pdf_url":"https://arxiv.org/pdf/2310.11589v1.pdf","comment":"26 pages, 15 figures"},{"id":"http://arxiv.org/abs/2310.11584v1","updated":"2023-10-17T21:05:20Z","published":"2023-10-17T21:05:20Z","title":"BasahaCorpus: An Expanded Linguistic Resource for Readability Assessment\n in Central Philippine Languages","summary":" Current research on automatic readability assessment (ARA) has focused on\nimproving the performance of models in high-resource languages such as English.\nIn this work, we introduce and release BasahaCorpus as part of an initiative\naimed at expanding available corpora and baseline models for readability\nassessment in lower-resource languages in the Philippines. We compiled a corpus\nof short fictional narratives written in Hiligaynon, Minasbate, Karay-a, and\nRinconada -- languages belonging to the Central Philippine family tree subgroup\n-- to train ARA models using surface-level, syllable-pattern, and n-gram\noverlap features. We also propose a new hierarchical cross-lingual modeling\napproach that takes advantage of a language's placement in the family tree to\nincrease the amount of available training data. Our study yields encouraging\nresults that support previous work showcasing the efficacy of cross-lingual\nmodels in low-resource settings, as well as similarities in highly informative\nlinguistic features for mutually intelligible languages.\n","authors":["Joseph Marvin Imperial","Ekaterina Kochmar"],"pdf_url":"https://arxiv.org/pdf/2310.11584v1.pdf","comment":"Final camera-ready paper for EMNLP 2023 (Main)"},{"id":"http://arxiv.org/abs/2305.14577v2","updated":"2023-10-17T21:03:10Z","published":"2023-05-23T23:31:02Z","title":"Difference-Masking: Choosing What to Mask in Continued Pretraining","summary":" The self-supervised objective of masking-and-predicting has led to promising\nperformance gains on a variety of downstream tasks. However, while most\napproaches randomly mask tokens, there is strong intuition that deciding what\nto mask can substantially improve learning outcomes. We investigate this in the\ncontinued pretraining setting in which pretrained models continue to pretrain\non domain-specific data before performing some downstream task. 
We introduce\nDifference-Masking, a masking strategy that automatically chooses what to mask\nduring continued pretraining by considering what makes a task domain different\nfrom the pretraining domain. Empirically, we find that Difference-Masking\noutperforms baselines on continued pretraining settings across four diverse\nlanguage-only and multimodal video tasks.\n","authors":["Alex Wilf","Syeda Nahida Akter","Leena Mathur","Paul Pu Liang","Sheryl Mathew","Mengrou Shou","Eric Nyberg","Louis-Philippe Morency"],"pdf_url":"https://arxiv.org/pdf/2305.14577v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11571v1","updated":"2023-10-17T20:40:59Z","published":"2023-10-17T20:40:59Z","title":"What is a good question? Task-oriented asking with fact-level masking","summary":" Asking questions is an important element of real-life collaboration on\nreasoning tasks like question answering. For example, a legal assistant chatbot\nmay be unable to make accurate recommendations without specific information on\nthe user's circumstances. However, large language models are usually deployed\nto solve reasoning tasks directly without asking follow-up questions to the\nuser or third parties. We term this problem task-oriented asking (TOA).\nZero-shot chat models can perform TOA, but their training is primarily based on\nnext-token prediction rather than whether questions contribute to successful\ncollaboration. To enable the training and evaluation of TOA models, we present\na definition and framework for natural language task-oriented asking, the\nproblem of generating questions that result in answers useful for a reasoning\ntask. We also present fact-level masking (FLM), a procedure for converting\nnatural language datasets into self-supervised TOA datasets by omitting\nparticular critical facts. Finally, we generate a TOA dataset from the HotpotQA\ndataset using FLM and evaluate several zero-shot language models on it. Our\nexperiments show that current zero-shot models struggle to ask questions that\nretrieve useful information, as compared to human annotators. These results\ndemonstrate an opportunity to use FLM datasets and the TOA framework to train\nand evaluate better TOA models.\n","authors":["Matthew Toles","Yukun Huang","Zhou Yu","Luis Gravano"],"pdf_url":"https://arxiv.org/pdf/2310.11571v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11564v1","updated":"2023-10-17T20:22:13Z","published":"2023-10-17T20:22:13Z","title":"Personalized Soups: Personalized Large Language Model Alignment via\n Post-hoc Parameter Merging","summary":" While Reinforcement Learning from Human Feedback (RLHF) aligns Large Language\nModels (LLMs) with general, aggregate human preferences, it is suboptimal for\nlearning diverse, individual perspectives. In this work, we study Reinforcement\nLearning from Personalized Human Feedback (RLPHF) problem, wherein LLMs are\naligned to multiple (sometimes conflicting) preferences by modeling alignment\nas a Multi-Objective Reinforcement Learning (MORL) problem. Compared to strong\nsingle-objective baselines, we show that we can achieve personalized alignment\nby decomposing preferences into multiple dimensions. 
These dimensions are\ndefined based on personalizations that are declared as desirable by the user.\nIn this work, we show that they can be efficiently trained independently in a\ndistributed manner and combined effectively post-hoc through parameter merging.\nThe code is available at https://github.com/joeljang/RLPHF.\n","authors":["Joel Jang","Seungone Kim","Bill Yuchen Lin","Yizhong Wang","Jack Hessel","Luke Zettlemoyer","Hannaneh Hajishirzi","Yejin Choi","Prithviraj Ammanabrolu"],"pdf_url":"https://arxiv.org/pdf/2310.11564v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2310.10378v2","updated":"2023-10-17T19:56:41Z","published":"2023-10-16T13:19:17Z","title":"Cross-Lingual Consistency of Factual Knowledge in Multilingual Language\n Models","summary":" Multilingual large-scale Pretrained Language Models (PLMs) have been shown to\nstore considerable amounts of factual knowledge, but large variations are\nobserved across languages. With the ultimate goal of ensuring that users with\ndifferent language backgrounds obtain consistent feedback from the same model,\nwe study the cross-lingual consistency (CLC) of factual knowledge in various\nmultilingual PLMs. To this end, we propose a Ranking-based Consistency (RankC)\nmetric to evaluate knowledge consistency across languages independently from\naccuracy. Using this metric, we conduct an in-depth analysis of the determining\nfactors for CLC, both at model level and at language-pair level. Among other\nresults, we find that increasing model size leads to higher factual probing\naccuracy in most languages, but does not improve cross-lingual consistency.\nFinally, we conduct a case study on CLC when new factual associations are\ninserted in the PLMs via model editing. Results on a small sample of facts\ninserted in English reveal a clear pattern whereby the new piece of knowledge\ntransfers only to languages with which English has a high RankC score.\n","authors":["Jirui Qi","Raquel Fernández","Arianna Bisazza"],"pdf_url":"https://arxiv.org/pdf/2310.10378v2.pdf","comment":"Accepted at EMNLP2023 main conference. All code and data are released\n at https://github.com/Betswish/Cross-Lingual-Consistency"},{"id":"http://arxiv.org/abs/2310.10449v2","updated":"2023-10-17T19:54:16Z","published":"2023-10-16T14:33:02Z","title":"Text Summarization Using Large Language Models: A Comparative Study of\n MPT-7b-instruct, Falcon-7b-instruct, and OpenAI Chat-GPT Models","summary":" Text summarization is a critical Natural Language Processing (NLP) task with\napplications ranging from information retrieval to content generation.\nLeveraging Large Language Models (LLMs) has shown remarkable promise in\nenhancing summarization techniques. This paper embarks on an exploration of\ntext summarization with a diverse set of LLMs, including MPT-7b-instruct,\nfalcon-7b-instruct, and OpenAI ChatGPT text-davinci-003 models. The experiment\nwas performed with different hyperparameters and evaluated the generated\nsummaries using widely accepted metrics such as the Bilingual Evaluation\nUnderstudy (BLEU) Score, Recall-Oriented Understudy for Gisting Evaluation\n(ROUGE) Score, and Bidirectional Encoder Representations from Transformers\n(BERT) Score. According to the experiment, text-davinci-003 outperformed the\nothers. This investigation involved two distinct datasets: CNN Daily Mail and\nXSum. 
Its primary objective was to provide a comprehensive understanding of the\nperformance of Large Language Models (LLMs) when applied to different datasets.\nThe assessment of these models' effectiveness contributes valuable insights to\nresearchers and practitioners within the NLP domain. This work serves as a\nresource for those interested in harnessing the potential of LLMs for text\nsummarization and lays the foundation for the development of advanced\nGenerative AI applications aimed at addressing a wide spectrum of business\nchallenges.\n","authors":["Lochan Basyal","Mihir Sanghvi"],"pdf_url":"https://arxiv.org/pdf/2310.10449v2.pdf","comment":"4 pages, 2 tables"},{"id":"http://arxiv.org/abs/2310.11541v1","updated":"2023-10-17T19:27:23Z","published":"2023-10-17T19:27:23Z","title":"MUST&P-SRL: Multi-lingual and Unified Syllabification in Text and\n Phonetic Domains for Speech Representation Learning","summary":" In this paper, we present a methodology for linguistic feature extraction,\nfocusing particularly on automatically syllabifying words in multiple\nlanguages, with a design to be compatible with a forced-alignment tool, the\nMontreal Forced Aligner (MFA). In both the textual and phonetic domains, our\nmethod focuses on the extraction of phonetic transcriptions from text, stress\nmarks, and a unified automatic syllabification (in text and phonetic domains).\nThe system was built with open-source components and resources. Through an\nablation study, we demonstrate the efficacy of our approach in automatically\nsyllabifying words from several languages (English, French and Spanish).\nAdditionally, we apply the technique to the transcriptions of the CMU ARCTIC\ndataset, generating valuable annotations available\nonline\\footnote{\\url{https://github.com/noetits/MUST_P-SRL}} that are ideal for\nspeech representation learning, speech unit discovery, and disentanglement of\nspeech factors in several speech-related fields.\n","authors":["Noé Tits"],"pdf_url":"https://arxiv.org/pdf/2310.11541v1.pdf","comment":"Accepted for publication at EMNLP 2023"},{"id":"http://arxiv.org/abs/2304.13007v3","updated":"2023-10-17T19:18:05Z","published":"2023-04-25T17:27:37Z","title":"Answering Questions by Meta-Reasoning over Multiple Chains of Thought","summary":" Modern systems for multi-hop question answering (QA) typically break\nquestions into a sequence of reasoning steps, termed chain-of-thought (CoT),\nbefore arriving at a final answer. Often, multiple chains are sampled and\naggregated through a voting mechanism over the final answers, but the\nintermediate steps themselves are discarded. While such approaches improve\nperformance, they do not consider the relations between intermediate steps\nacross chains and do not provide a unified explanation for the predicted\nanswer. We introduce Multi-Chain Reasoning (MCR), an approach which prompts\nlarge language models to meta-reason over multiple chains of thought, rather\nthan aggregating their answers. MCR examines different reasoning chains, mixes\ninformation between them and selects the most relevant facts in generating an\nexplanation and predicting the answer. MCR outperforms strong baselines on 7\nmulti-hop QA datasets. 
Moreover, our analysis reveals that MCR explanations\nexhibit high quality, enabling humans to verify its answers.\n","authors":["Ori Yoran","Tomer Wolfson","Ben Bogin","Uri Katz","Daniel Deutch","Jonathan Berant"],"pdf_url":"https://arxiv.org/pdf/2304.13007v3.pdf","comment":"Accepted for publication in The 2023 Conference on Empirical Methods\n in Natural Language Processing (EMNLP 2023). Author's final version"},{"id":"http://arxiv.org/abs/2310.11532v1","updated":"2023-10-17T19:02:40Z","published":"2023-10-17T19:02:40Z","title":"Multi-stage Large Language Model Correction for Speech Recognition","summary":" In this paper, we investigate the usage of large language models (LLMs) to\nimprove the performance of competitive speech recognition systems. Different\nfrom traditional language models that focus on one single data domain, the rise\nof LLMs brings us the opportunity to push the limit of state-of-the-art ASR\nperformance, and at the same time to achieve higher robustness and generalize\neffectively across multiple domains. Motivated by this, we propose a novel\nmulti-stage approach to combine traditional language model re-scoring and LLM\nprompting. Specifically, the proposed method has two stages: the first stage\nuses a language model to re-score an N-best list of ASR hypotheses and run a\nconfidence check; The second stage uses prompts to a LLM to perform ASR error\ncorrection on less confident results from the first stage. Our experimental\nresults demonstrate the effectiveness of the proposed method by showing a 10% ~\n20% relative improvement in WER over a competitive ASR system -- across\nmultiple test domains.\n","authors":["Jie Pu","Thai-Son Nguyen","Sebastian Stüker"],"pdf_url":"https://arxiv.org/pdf/2310.11532v1.pdf","comment":"Submitted to ICASSP 2024"},{"id":"http://arxiv.org/abs/2210.03350v3","updated":"2023-10-17T18:57:17Z","published":"2022-10-07T06:50:23Z","title":"Measuring and Narrowing the Compositionality Gap in Language Models","summary":" We investigate the ability of language models to perform compositional\nreasoning tasks where the overall solution depends on correctly composing the\nanswers to sub-problems. We measure how often models can correctly answer all\nsub-problems but not generate the overall solution, a ratio we call the\ncompositionality gap. We evaluate this ratio by asking multi-hop questions with\nanswers that require composing multiple facts unlikely to have been observed\ntogether during pretraining. In the GPT-3 family of models, as model size\nincreases we show that the single-hop question answering performance improves\nfaster than the multi-hop performance does, therefore the compositionality gap\ndoes not decrease. This surprising result suggests that while more powerful\nmodels memorize and recall more factual knowledge, they show no corresponding\nimprovement in their ability to perform this kind of compositional reasoning.\n We then demonstrate how elicitive prompting (such as chain of thought)\nnarrows the compositionality gap by reasoning explicitly. We present a new\nmethod, self-ask, that further improves on chain of thought. In our method, the\nmodel explicitly asks itself (and answers) follow-up questions before answering\nthe initial question. We finally show that self-ask's structured prompting lets\nus easily plug in a search engine to answer the follow-up questions, which\nadditionally improves accuracy.\n","authors":["Ofir Press","Muru Zhang","Sewon Min","Ludwig Schmidt","Noah A. 
Smith","Mike Lewis"],"pdf_url":"https://arxiv.org/pdf/2210.03350v3.pdf","comment":"To appear at Findings of EMNLP 2023"},{"id":"http://arxiv.org/abs/2310.11523v1","updated":"2023-10-17T18:41:57Z","published":"2023-10-17T18:41:57Z","title":"Group Preference Optimization: Few-Shot Alignment of Large Language\n Models","summary":" Many applications of large language models (LLMs), ranging from chatbots to\ncreative writing, require nuanced subjective judgments that can differ\nsignificantly across different groups. Existing alignment algorithms can be\nexpensive to align for each group, requiring prohibitive amounts of\ngroup-specific preference data and computation for real-world use cases. We\nintroduce Group Preference Optimization (GPO), an alignment framework that\nsteers language models to preferences of individual groups in a few-shot\nmanner. In GPO, we augment the base LLM with an independent transformer module\ntrained to predict the preferences of a group for the LLM generations. For\nfew-shot learning, we parameterize this module as an in-context autoregressive\ntransformer and train it via meta-learning on several groups. We empirically\nvalidate the efficacy of GPO through rigorous evaluations using LLMs with\nvaried sizes on three human opinion adaptation tasks. These tasks involve\nadapting to the preferences of US demographic groups, global countries, and\nindividual users. Our results demonstrate that GPO not only aligns models more\naccurately but also requires fewer group-specific preferences, and less\ntraining and inference computing resources, outperforming existing strategies\nsuch as in-context steering and fine-tuning methods.\n","authors":["Siyan Zhao","John Dang","Aditya Grover"],"pdf_url":"https://arxiv.org/pdf/2310.11523v1.pdf","comment":"24 pages, 12 figures"},{"id":"http://arxiv.org/abs/2310.11520v1","updated":"2023-10-17T18:38:03Z","published":"2023-10-17T18:38:03Z","title":"Automatic News Summerization","summary":" Natural Language Processing is booming with its applications in the real\nworld, one of which is Text Summarization for large texts including news\narticles. This research paper provides an extensive comparative evaluation of\nextractive and abstractive approaches for news text summarization, with an\nemphasis on the ROUGE score analysis. The study employs the CNN-Daily Mail\ndataset, which consists of news articles and human-generated reference\nsummaries. The evaluation employs ROUGE scores to assess the efficacy and\nquality of generated summaries. After Evaluation, we integrate the\nbest-performing models on a web application to assess their real-world\ncapabilities and user experience.\n","authors":["Kavach Dheer","Arpit Dhankhar"],"pdf_url":"https://arxiv.org/pdf/2310.11520v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.11649v2","updated":"2023-10-17T18:21:27Z","published":"2023-02-22T20:56:40Z","title":"Grounding Complex Natural Language Commands for Temporal Tasks in Unseen\n Environments","summary":" Grounding navigational commands to linear temporal logic (LTL) leverages its\nunambiguous semantics for reasoning about long-horizon tasks and verifying the\nsatisfaction of temporal constraints. Existing approaches require training data\nfrom the specific environment and landmarks that will be used in natural\nlanguage to understand commands in those environments. 
We propose Lang2LTL, a\nmodular system and a software package that leverages large language models\n(LLMs) to ground temporal navigational commands to LTL specifications in\nenvironments without prior language data. We comprehensively evaluate Lang2LTL\nfor five well-defined generalization behaviors. Lang2LTL demonstrates the\nstate-of-the-art ability of a single model to ground navigational commands to\ndiverse temporal specifications in 21 city-scaled environments. Finally, we\ndemonstrate a physical robot using Lang2LTL can follow 52 semantically diverse\nnavigational commands in two indoor environments.\n","authors":["Jason Xinyu Liu","Ziyi Yang","Ifrah Idrees","Sam Liang","Benjamin Schornstein","Stefanie Tellex","Ankit Shah"],"pdf_url":"https://arxiv.org/pdf/2302.11649v2.pdf","comment":"Conference on Robot Learning 2023"},{"id":"http://arxiv.org/abs/2310.11511v1","updated":"2023-10-17T18:18:32Z","published":"2023-10-17T18:18:32Z","title":"Self-RAG: Learning to Retrieve, Generate, and Critique through\n Self-Reflection","summary":" Despite their remarkable capabilities, large language models (LLMs) often\nproduce responses containing factual inaccuracies due to their sole reliance on\nthe parametric knowledge they encapsulate. Retrieval-Augmented Generation\n(RAG), an ad hoc approach that augments LMs with retrieval of relevant\nknowledge, decreases such issues. However, indiscriminately retrieving and\nincorporating a fixed number of retrieved passages, regardless of whether\nretrieval is necessary, or passages are relevant, diminishes LM versatility or\ncan lead to unhelpful response generation. We introduce a new framework called\nSelf-Reflective Retrieval-Augmented Generation (Self-RAG) that enhances an LM's\nquality and factuality through retrieval and self-reflection. Our framework\ntrains a single arbitrary LM that adaptively retrieves passages on-demand, and\ngenerates and reflects on retrieved passages and its own generations using\nspecial tokens, called reflection tokens. Generating reflection tokens makes\nthe LM controllable during the inference phase, enabling it to tailor its\nbehavior to diverse task requirements. Experiments show that Self-RAG (7B and\n13B parameters) significantly outperforms state-of-the-art LLMs and\nretrieval-augmented models on a diverse set of tasks. Specifically, Self-RAG\noutperforms ChatGPT and retrieval-augmented Llama2-chat on Open-domain QA,\nreasoning and fact verification tasks, and it shows significant gains in\nimproving factuality and citation accuracy for long-form generations relative\nto these models.\n","authors":["Akari Asai","Zeqiu Wu","Yizhong Wang","Avirup Sil","Hannaneh Hajishirzi"],"pdf_url":"https://arxiv.org/pdf/2310.11511v1.pdf","comment":"30 pages, 2 figures, 12 tables"},{"id":"http://arxiv.org/abs/2310.07177v2","updated":"2023-10-17T18:02:19Z","published":"2023-10-11T04:03:42Z","title":"Online Speculative Decoding","summary":" Speculative decoding is a pivotal technique to accelerate the inference of\nlarge language models (LLMs) by employing a smaller draft model to predict the\ntarget model's outputs. However, its efficacy can be limited due to the low\npredictive accuracy of the draft model, particularly when faced with diverse\ntext inputs and a significant capability gap between the draft and target\nmodels. We introduce online speculative decoding (OSD) to address this\nchallenge. 
The main idea is to continually update (multiple) draft model(s) on\nobserved user query data using the abundant excess computational power in an\nLLM serving cluster. Given that LLM inference is memory-bounded, the surplus\ncomputational power in a typical LLM serving cluster can be repurposed for\nonline retraining of draft models, thereby making the training cost-neutral.\nSince the query distribution of an LLM service is relatively simple, retraining\non query distribution enables the draft model to more accurately predict the\ntarget model's outputs, particularly on data originating from query\ndistributions. As the draft model evolves online, it aligns with the query\ndistribution in real time, mitigating distribution shifts. We develop a\nprototype of online speculative decoding based on online knowledge distillation\nand evaluate it using both synthetic and real query data on several popular\nLLMs. The results show a substantial increase in the token acceptance rate by\n0.1 to 0.65, which translates into 1.22x to 3.06x latency reduction.\n","authors":["Xiaoxuan Liu","Lanxiang Hu","Peter Bailis","Ion Stoica","Zhijie Deng","Alvin Cheung","Hao Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.07177v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11501v1","updated":"2023-10-17T18:00:25Z","published":"2023-10-17T18:00:25Z","title":"CoMPosT: Characterizing and Evaluating Caricature in LLM Simulations","summary":" Recent work has aimed to capture nuances of human behavior by using LLMs to\nsimulate responses from particular demographics in settings like social science\nexperiments and public opinion surveys. However, there are currently no\nestablished ways to discuss or evaluate the quality of such LLM simulations.\nMoreover, there is growing concern that these LLM simulations are flattened\ncaricatures of the personas that they aim to simulate, failing to capture the\nmultidimensionality of people and perpetuating stereotypes. To bridge these\ngaps, we present CoMPosT, a framework to characterize LLM simulations using\nfour dimensions: Context, Model, Persona, and Topic. We use this framework to\nmeasure open-ended LLM simulations' susceptibility to caricature, defined via\ntwo criteria: individuation and exaggeration. We evaluate the level of\ncaricature in scenarios from existing work on LLM simulations. We find that for\nGPT-4, simulations of certain demographics (political and marginalized groups)\nand topics (general, uncontroversial) are highly susceptible to caricature.\n","authors":["Myra Cheng","Tiziano Piccardi","Diyi Yang"],"pdf_url":"https://arxiv.org/pdf/2310.11501v1.pdf","comment":"To appear at EMNLP 2023 (Main)"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2310.11449v1","updated":"2023-10-17T17:58:00Z","published":"2023-10-17T17:58:00Z","title":"DELIFFAS: Deformable Light Fields for Fast Avatar Synthesis","summary":" Generating controllable and photorealistic digital human avatars is a\nlong-standing and important problem in Vision and Graphics. Recent methods have\nshown great progress in terms of either photorealism or inference speed while\nthe combination of the two desired properties still remains unsolved. To this\nend, we propose a novel method, called DELIFFAS, which parameterizes the\nappearance of the human as a surface light field that is attached to a\ncontrollable and deforming human mesh model. 
At the core, we represent the\nlight field around the human with a deformable two-surface parameterization,\nwhich enables fast and accurate inference of the human appearance. This allows\nperceptual supervision on the full image compared to previous approaches that\ncould only supervise individual pixels or small patches due to their slow\nruntime. Our carefully designed human representation and supervision strategy\nleads to state-of-the-art synthesis results and inference time. The video\nresults and code are available at\nhttps://vcai.mpi-inf.mpg.de/projects/DELIFFAS.\n","authors":["Youngjoong Kwon","Lingjie Liu","Henry Fuchs","Marc Habermann","Christian Theobalt"],"pdf_url":"https://arxiv.org/pdf/2310.11449v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11448v1","updated":"2023-10-17T17:57:38Z","published":"2023-10-17T17:57:38Z","title":"4K4D: Real-Time 4D View Synthesis at 4K Resolution","summary":" This paper targets high-fidelity and real-time view synthesis of dynamic 3D\nscenes at 4K resolution. Recently, some methods on dynamic view synthesis have\nshown impressive rendering quality. However, their speed is still limited when\nrendering high-resolution images. To overcome this problem, we propose 4K4D, a\n4D point cloud representation that supports hardware rasterization and enables\nunprecedented rendering speed. Our representation is built on a 4D feature grid\nso that the points are naturally regularized and can be robustly optimized. In\naddition, we design a novel hybrid appearance model that significantly boosts\nthe rendering quality while preserving efficiency. Moreover, we develop a\ndifferentiable depth peeling algorithm to effectively learn the proposed model\nfrom RGB videos. Experiments show that our representation can be rendered at\nover 400 FPS on the DNA-Rendering dataset at 1080p resolution and 80 FPS on the\nENeRF-Outdoor dataset at 4K resolution using an RTX 4090 GPU, which is 30x\nfaster than previous methods and achieves the state-of-the-art rendering\nquality. We will release the code for reproducibility.\n","authors":["Zhen Xu","Sida Peng","Haotong Lin","Guangzhao He","Jiaming Sun","Yujun Shen","Hujun Bao","Xiaowei Zhou"],"pdf_url":"https://arxiv.org/pdf/2310.11448v1.pdf","comment":"Project Page: https://zju3dv.github.io/4k4d"},{"id":"http://arxiv.org/abs/2309.16646v2","updated":"2023-10-17T17:54:37Z","published":"2023-09-28T17:51:05Z","title":"Improving Equivariance in State-of-the-Art Supervised Depth and Normal\n Predictors","summary":" Dense depth and surface normal predictors should possess the equivariant\nproperty to cropping-and-resizing -- cropping the input image should result in\ncropping the same output image. However, we find that state-of-the-art depth\nand normal predictors, despite having strong performances, surprisingly do not\nrespect equivariance. The problem exists even when crop-and-resize data\naugmentation is employed during training. To remedy this, we propose an\nequivariant regularization technique, consisting of an averaging procedure and\na self-consistency loss, to explicitly promote cropping-and-resizing\nequivariance in depth and normal networks. Our approach can be applied to both\nCNN and Transformer architectures, does not incur extra cost during testing,\nand notably improves the supervised and semi-supervised learning performance of\ndense predictors on Taskonomy tasks. 
Finally, finetuning with our loss on\nunlabeled images improves not only equivariance but also accuracy of\nstate-of-the-art depth and normal predictors when evaluated on NYU-v2. GitHub\nlink: https://github.com/mikuhatsune/equivariance\n","authors":["Yuanyi Zhong","Anand Bhattad","Yu-Xiong Wang","David Forsyth"],"pdf_url":"https://arxiv.org/pdf/2309.16646v2.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2310.11441v1","updated":"2023-10-17T17:51:31Z","published":"2023-10-17T17:51:31Z","title":"Set-of-Mark Prompting Unleashes Extraordinary Visual Grounding in GPT-4V","summary":" We present Set-of-Mark (SoM), a new visual prompting method, to unleash the\nvisual grounding abilities of large multimodal models (LMMs), such as GPT-4V.\nAs illustrated in Fig. 1 (right), we employ off-the-shelf interactive\nsegmentation models, such as SAM, to partition an image into regions at\ndifferent levels of granularity, and overlay these regions with a set of marks\ne.g., alphanumerics, masks, boxes. Using the marked image as input, GPT-4V can\nanswer the questions that require visual grounding. We perform a comprehensive\nempirical study to validate the effectiveness of SoM on a wide range of\nfine-grained vision and multimodal tasks. For example, our experiments show\nthat GPT-4V with SoM outperforms the state-of-the-art fully-finetuned referring\nsegmentation model on RefCOCOg in a zero-shot setting.\n","authors":["Jianwei Yang","Hao Zhang","Feng Li","Xueyan Zou","Chunyuan Li","Jianfeng Gao"],"pdf_url":"https://arxiv.org/pdf/2310.11441v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11440v1","updated":"2023-10-17T17:50:46Z","published":"2023-10-17T17:50:46Z","title":"EvalCrafter: Benchmarking and Evaluating Large Video Generation Models","summary":" The vision and language generative models have been overgrown in recent\nyears. For video generation, various open-sourced models and public-available\nservices are released for generating high-visual quality videos. However, these\nmethods often use a few academic metrics, for example, FVD or IS, to evaluate\nthe performance. We argue that it is hard to judge the large conditional\ngenerative models from the simple metrics since these models are often trained\non very large datasets with multi-aspect abilities. Thus, we propose a new\nframework and pipeline to exhaustively evaluate the performance of the\ngenerated videos. To achieve this, we first conduct a new prompt list for\ntext-to-video generation by analyzing the real-world prompt list with the help\nof the large language model. Then, we evaluate the state-of-the-art video\ngenerative models on our carefully designed benchmarks, in terms of visual\nqualities, content qualities, motion qualities, and text-caption alignment with\naround 18 objective metrics. To obtain the final leaderboard of the models, we\nalso fit a series of coefficients to align the objective metrics to the users'\nopinions. 
Based on the proposed opinion alignment method, our final score shows\na higher correlation than simply averaging the metrics, showing the\neffectiveness of the proposed evaluation method.\n","authors":["Yaofang Liu","Xiaodong Cun","Xuebo Liu","Xintao Wang","Yong Zhang","Haoxin Chen","Yang Liu","Tieyong Zeng","Raymond Chan","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2310.11440v1.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2303.09447v2","updated":"2023-10-17T17:46:01Z","published":"2023-03-16T16:23:13Z","title":"Steering Prototypes with Prompt-tuning for Rehearsal-free Continual\n Learning","summary":" In the context of continual learning, prototypes-as representative class\nembeddings-offer advantages in memory conservation and the mitigation of\ncatastrophic forgetting. However, challenges related to semantic drift and\nprototype interference persist. In this study, we introduce the Contrastive\nPrototypical Prompt (CPP) approach. Through task-specific prompt-tuning,\nunderpinned by a contrastive learning objective, we effectively address both\naforementioned challenges. Our evaluations on four challenging\nclass-incremental benchmarks reveal that CPP achieves a significant 4% to 6%\nimprovement over state-of-the-art methods. Importantly, CPP operates without a\nrehearsal buffer and narrows the performance divergence between continual and\noffline joint-learning, suggesting an innovative scheme for Transformer-based\ncontinual learning systems.\n","authors":["Zhuowei Li","Long Zhao","Zizhao Zhang","Han Zhang","Di Liu","Ting Liu","Dimitris N. Metaxas"],"pdf_url":"https://arxiv.org/pdf/2303.09447v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2203.16482v3","updated":"2023-10-17T17:37:54Z","published":"2022-03-30T17:18:11Z","title":"RFNet-4D++: Joint Object Reconstruction and Flow Estimation from 4D\n Point Clouds with Cross-Attention Spatio-Temporal Features","summary":" Object reconstruction from 3D point clouds has been a long-standing research\nproblem in computer vision and computer graphics, and achieved impressive\nprogress. However, reconstruction from time-varying point clouds (a.k.a. 4D\npoint clouds) is generally overlooked. In this paper, we propose a new network\narchitecture, namely RFNet-4D++, that jointly reconstructs objects and their\nmotion flows from 4D point clouds. The key insight is simultaneously performing\nboth tasks via learning of spatial and temporal features from a sequence of\npoint clouds can leverage individual tasks, leading to improved overall\nperformance. To prove this ability, we design a temporal vector field learning\nmodule using an unsupervised learning approach for flow estimation task,\nleveraged by supervised learning of spatial structures for object\nreconstruction. Extensive experiments and analyses on benchmark datasets\nvalidated the effectiveness and efficiency of our method. As shown in\nexperimental results, our method achieves state-of-the-art performance on both\nflow estimation and object reconstruction while performing much faster than\nexisting methods in both training and inference. 
Our code and data are\navailable at https://github.com/hkust-vgd/RFNet-4D\n","authors":["Tuan-Anh Vu","Duc Thanh Nguyen","Binh-Son Hua","Quang-Hieu Pham","Sai-Kit Yeung"],"pdf_url":"https://arxiv.org/pdf/2203.16482v3.pdf","comment":"TPAMI journal extension of ECCV 2022 arXiv:2203.16482"},{"id":"http://arxiv.org/abs/2310.11420v1","updated":"2023-10-17T17:28:03Z","published":"2023-10-17T17:28:03Z","title":"Revisiting Map Relations for Unsupervised Non-Rigid Shape Matching","summary":" We propose a novel unsupervised learning approach for non-rigid 3D shape\nmatching. Our approach improves upon recent state-of-the-art deep functional\nmap methods and can be applied to a broad range of different challenging\nscenarios. Previous deep functional map methods mainly focus on feature\nextraction and aim exclusively at obtaining more expressive features for\nfunctional map computation. However, the importance of the functional map\ncomputation itself is often neglected and the relationship between the\nfunctional map and point-wise map is underexplored. In this paper, we\nsystematically investigate the coupling relationship between the functional map\nfrom the functional map solver and the point-wise map based on feature\nsimilarity. To this end, we propose a self-adaptive functional map solver to\nadjust the functional map regularisation for different shape matching\nscenarios, together with a vertex-wise contrastive loss to obtain more\ndiscriminative features. Using different challenging datasets (including\nnon-isometry, topological noise and partiality), we demonstrate that our method\nsubstantially outperforms previous state-of-the-art methods.\n","authors":["Dongliang Cao","Paul Roetzer","Florian Bernard"],"pdf_url":"https://arxiv.org/pdf/2310.11420v1.pdf","comment":"3DV 2024"},{"id":"http://arxiv.org/abs/2310.11417v1","updated":"2023-10-17T17:25:31Z","published":"2023-10-17T17:25:31Z","title":"VcT: Visual change Transformer for Remote Sensing Image Change Detection","summary":" Existing visual change detectors usually adopt CNNs or Transformers for\nfeature representation learning and focus on learning effective representation\nfor the changed regions between images. Although good performance can be\nobtained by enhancing the features of the changed regions, these works\nare still limited, mainly because they ignore the unchanged\nbackground context information. It is known that one main challenge for change\ndetection is how to obtain the consistent representations for two images\ninvolving different variations, such as spatial variation, sunlight intensity,\netc. In this work, we demonstrate that carefully mining the common background\ninformation provides an important cue to learn the consistent representations\nfor the two images which thus obviously facilitates the visual change detection\nproblem. Based on this observation, we propose a novel Visual change\nTransformer (VcT) model for the visual change detection problem. To be specific, a\nshared backbone network is first used to extract the feature maps for the given\nimage pair. Then, each pixel of the feature map is regarded as a graph node, and a\ngraph neural network is proposed to model the structured information for coarse\nchange map prediction. Top-K reliable tokens can be mined from the map and\nrefined by using the clustering algorithm. 
Then, these reliable tokens are\nenhanced by first utilizing self/cross-attention schemes and then interacting\nwith original features via an anchor-primary attention learning module.\nFinally, the prediction head is proposed to get a more accurate change map.\nExtensive experiments on multiple benchmark datasets validated the\neffectiveness of our proposed VcT model.\n","authors":["Bo Jiang","Zitian Wang","Xixi Wang","Ziyan Zhang","Lan Chen","Xiao Wang","Bin Luo"],"pdf_url":"https://arxiv.org/pdf/2310.11417v1.pdf","comment":"Accepted by IEEE Transactions on Geoscience and Remote Sensing (TGRS)\n 2023"},{"id":"http://arxiv.org/abs/2305.13812v2","updated":"2023-10-17T17:07:29Z","published":"2023-05-23T08:28:38Z","title":"Coarse-to-Fine Contrastive Learning in Image-Text-Graph Space for\n Improved Vision-Language Compositionality","summary":" Contrastively trained vision-language models have achieved remarkable\nprogress in vision and language representation learning, leading to\nstate-of-the-art models for various downstream multimodal tasks. However,\nrecent research has highlighted severe limitations of these models in their\nability to perform compositional reasoning over objects, attributes, and\nrelations. Scene graphs have emerged as an effective way to understand images\ncompositionally. These are graph-structured semantic representations of images\nthat contain objects, their attributes, and relations with other objects in a\nscene. In this work, we consider the scene graph parsed from text as a proxy\nfor the image scene graph and propose a graph decomposition and augmentation\nframework along with a coarse-to-fine contrastive learning objective between\nimages and text that aligns sentences of various complexities to the same\nimage. Along with this, we propose novel negative mining techniques in the\nscene graph space for improving attribute binding and relation understanding.\nThrough extensive experiments, we demonstrate the effectiveness of our approach\nthat significantly improves attribute binding, relation understanding,\nsystematic generalization, and productivity on multiple recently proposed\nbenchmarks (For example, improvements upto $18\\%$ for systematic\ngeneralization, $16.5\\%$ for relation understanding over a strong baseline),\nwhile achieving similar or better performance than CLIP on various general\nmultimodal tasks.\n","authors":["Harman Singh","Pengchuan Zhang","Qifan Wang","Mengjiao Wang","Wenhan Xiong","Jingfei Du","Yu Chen"],"pdf_url":"https://arxiv.org/pdf/2305.13812v2.pdf","comment":"EMNLP 2023 (main)"},{"id":"http://arxiv.org/abs/2310.11392v1","updated":"2023-10-17T16:45:47Z","published":"2023-10-17T16:45:47Z","title":"Towards Automatic Satellite Images Captions Generation Using Large\n Language Models","summary":" Automatic image captioning is a promising technique for conveying visual\ninformation using natural language. It can benefit various tasks in satellite\nremote sensing, such as environmental monitoring, resource management, disaster\nmanagement, etc. However, one of the main challenges in this domain is the lack\nof large-scale image-caption datasets, as they require a lot of human expertise\nand effort to create. Recent research on large language models (LLMs) has\ndemonstrated their impressive performance in natural language understanding and\ngeneration tasks. 
Nonetheless, most of them cannot handle images (GPT-3.5,\nFalcon, Claude, etc.), while conventional captioning models pre-trained on\ngeneral ground-view images often fail to produce detailed and accurate captions\nfor aerial images (BLIP, GIT, CM3, CM3Leon, etc.). To address this problem, we\npropose a novel approach: Automatic Remote Sensing Image Captioning (ARSIC) to\nautomatically collect captions for remote sensing images by guiding LLMs to\ndescribe their object annotations. We also present a benchmark model that\nadapts the pre-trained generative image2text model (GIT) to generate\nhigh-quality captions for remote-sensing images. Our evaluation demonstrates\nthe effectiveness of our approach for collecting captions for remote sensing\nimages.\n","authors":["Yingxu He","Qiqi Sun"],"pdf_url":"https://arxiv.org/pdf/2310.11392v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.16309v3","updated":"2023-10-17T16:34:46Z","published":"2023-05-25T17:58:14Z","title":"Imitating Task and Motion Planning with Visuomotor Transformers","summary":" Imitation learning is a powerful tool for training robot manipulation\npolicies, allowing them to learn from expert demonstrations without manual\nprogramming or trial-and-error. However, common methods of data collection,\nsuch as human supervision, scale poorly, as they are time-consuming and\nlabor-intensive. In contrast, Task and Motion Planning (TAMP) can autonomously\ngenerate large-scale datasets of diverse demonstrations. In this work, we show\nthat the combination of large-scale datasets generated by TAMP supervisors and\nflexible Transformer models to fit them is a powerful paradigm for robot\nmanipulation. To that end, we present a novel imitation learning system called\nOPTIMUS that trains large-scale visuomotor Transformer policies by imitating a\nTAMP agent. OPTIMUS introduces a pipeline for generating TAMP data that is\nspecifically curated for imitation learning and can be used to train performant\ntransformer-based policies. In this paper, we present a thorough study of the\ndesign decisions required to imitate TAMP and demonstrate that OPTIMUS can\nsolve a wide variety of challenging vision-based manipulation tasks with over\n70 different objects, ranging from long-horizon pick-and-place tasks, to shelf\nand articulated object manipulation, achieving 70 to 80% success rates. Video\nresults and code at https://mihdalal.github.io/optimus/\n","authors":["Murtaza Dalal","Ajay Mandlekar","Caelan Garrett","Ankur Handa","Ruslan Salakhutdinov","Dieter Fox"],"pdf_url":"https://arxiv.org/pdf/2305.16309v3.pdf","comment":"Conference on Robot Learning (CoRL) 2023. 8 pages, 5 figures, 2\n tables; 11 pages appendix (10 additional figures)"},{"id":"http://arxiv.org/abs/2310.11385v1","updated":"2023-10-17T16:32:38Z","published":"2023-10-17T16:32:38Z","title":"A voxel-level approach to brain age prediction: A method to assess\n regional brain aging","summary":" Brain aging is a regional phenomenon, a facet that remains relatively\nunder-explored within the realm of brain age prediction research using machine\nlearning methods. Voxel-level predictions can provide localized brain age\nestimates that can provide granular insights into the regional aging processes.\nThis is essential to understand the differences in aging trajectories in\nhealthy versus diseased subjects. In this work, a deep learning-based multitask\nmodel is proposed for voxel-level brain age prediction from T1-weighted\nmagnetic resonance images. 
The proposed model outperforms the models existing\nin the literature and yields valuable clinical insights when applied to both\nhealthy and diseased populations. Regional analysis is performed on the\nvoxel-level brain age predictions to understand aging trajectories of known\nanatomical regions in the brain and show that there exist disparities in\nregional aging trajectories of healthy subjects compared to ones with\nunderlying neurological disorders such as Dementia and more specifically,\nAlzheimer's disease. Our code is available at\nhttps://github.com/nehagianchandani/Voxel-level-brain-age-prediction.\n","authors":["Neha Gianchandani","Mahsa Dibaji","Johanna Ospel","Fernando Vega","Mariana Bento","M. Ethan MacDonald","Roberto Souza"],"pdf_url":"https://arxiv.org/pdf/2310.11385v1.pdf","comment":"27 pages, submitted to MELBA"},{"id":"http://arxiv.org/abs/2302.03397v2","updated":"2023-10-17T16:29:12Z","published":"2023-02-07T11:04:14Z","title":"AniPixel: Towards Animatable Pixel-Aligned Human Avatar","summary":" Although human reconstruction typically results in human-specific avatars,\nrecent 3D scene reconstruction techniques utilizing pixel-aligned features show\npromise in generalizing to new scenes. Applying these techniques to human\navatar reconstruction can result in a volumetric avatar with generalizability\nbut limited animatability due to rendering only being possible for static\nrepresentations. In this paper, we propose AniPixel, a novel animatable and\ngeneralizable human avatar reconstruction method that leverages pixel-aligned\nfeatures for body geometry prediction and RGB color blending. Technically, to\nalign the canonical space with the target space and the observation space, we\npropose a bidirectional neural skinning field based on skeleton-driven\ndeformation to establish the target-to-canonical and canonical-to-observation\ncorrespondences. Then, we disentangle the canonical body geometry into a\nnormalized neutral-sized body and a subject-specific residual for better\ngeneralizability. As the geometry and appearance are closely related, we\nintroduce pixel-aligned features to facilitate the body geometry prediction and\ndetailed surface normals to reinforce the RGB color blending. We also devise a\npose-dependent and view direction-related shading module to represent the local\nillumination variance. Experiments show that AniPixel renders comparable novel\nviews while delivering better novel pose animation results than\nstate-of-the-art methods.\n","authors":["Jinlong Fan","Jing Zhang","Zhi Hou","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2302.03397v2.pdf","comment":"Accepted by MM'23, code will be released at\n https://github.com/loong8888/AniPixel"},{"id":"http://arxiv.org/abs/2301.05065v2","updated":"2023-10-17T16:11:36Z","published":"2023-01-12T15:03:05Z","title":"Toward Building General Foundation Models for Language, Vision, and\n Vision-Language Understanding Tasks","summary":" Foundation models or pre-trained models have substantially improved the\nperformance of various language, vision, and vision-language understanding\ntasks. However, existing foundation models can only perform the best in one\ntype of tasks, namely language, vision, or vision-language. It is still an open\nquestion whether it is possible to construct a foundation model performing the\nbest for all the understanding tasks, which we call a general foundation model.\nIn this paper, we propose a new general foundation model, X-FM (the\nX-Foundation Model). 
X-FM has one language encoder, one vision encoder, and one\nfusion encoder, as well as a new training method. The training method includes\ntwo new techniques for learning X-FM from text, image, and image-text pair\ndata. One is to stop gradients from the vision-language training when learning\nthe language encoder. The other is to leverage the vision-language training to\nguide the learning of the vision encoder. Extensive experiments on benchmark\ndatasets show that X-FM can significantly outperform existing general\nfoundation models and perform better than or comparable to existing foundation\nmodels specifically for language, vision, or vision-language understanding.\nCode and pre-trained models are released at\nhttps://github.com/zhangxinsong-nlp/XFM.\n","authors":["Xinsong Zhang","Yan Zeng","Jipeng Zhang","Hang Li"],"pdf_url":"https://arxiv.org/pdf/2301.05065v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.10190v2","updated":"2023-10-17T15:44:32Z","published":"2023-06-16T21:51:04Z","title":"ALP: Action-Aware Embodied Learning for Perception","summary":" Current methods in training and benchmarking vision models exhibit an\nover-reliance on passive, curated datasets. Although models trained on these\ndatasets have shown strong performance in a wide variety of tasks such as\nclassification, detection, and segmentation, they fundamentally are unable to\ngeneralize to an ever-evolving world due to constant out-of-distribution shifts\nof input data. Therefore, instead of training on fixed datasets, can we\napproach learning in a more human-centric and adaptive manner? In this paper,\nwe introduce Action-Aware Embodied Learning for Perception (ALP), an embodied\nlearning framework that incorporates action information into representation\nlearning through a combination of optimizing a reinforcement learning policy\nand an inverse dynamics prediction objective. Our method actively explores in\ncomplex 3D environments to both learn generalizable task-agnostic visual\nrepresentations as well as collect downstream training data. We show that ALP\noutperforms existing baselines in several downstream perception tasks. In\naddition, we show that by training on actively collected data more relevant to\nthe environment and task, our method generalizes more robustly to downstream\ntasks compared to models pre-trained on fixed datasets such as ImageNet.\n","authors":["Xinran Liang","Anthony Han","Wilson Yan","Aditi Raghunathan","Pieter Abbeel"],"pdf_url":"https://arxiv.org/pdf/2306.10190v2.pdf","comment":"project website available at https://xinranliang.github.io/alp/"},{"id":"http://arxiv.org/abs/2310.11346v1","updated":"2023-10-17T15:31:28Z","published":"2023-10-17T15:31:28Z","title":"Towards Generalizable Multi-Camera 3D Object Detection via Perspective\n Debiasing","summary":" Detecting objects in 3D space using multiple cameras, known as Multi-Camera\n3D Object Detection (MC3D-Det), has gained prominence with the advent of\nbird's-eye view (BEV) approaches. However, these methods often struggle when\nfaced with unfamiliar testing environments due to the lack of diverse training\ndata encompassing various viewpoints and environments. To address this, we\npropose a novel method that aligns 3D detection with 2D camera plane results,\nensuring consistent and accurate detections. Our framework, anchored in\nperspective debiasing, helps the learning of features resilient to domain\nshifts. 
In our approach, we render diverse view maps from BEV features and\nrectify the perspective bias of these maps, leveraging implicit foreground\nvolumes to bridge the camera and BEV planes. This two-step process promotes the\nlearning of perspective- and context-independent features, crucial for accurate\nobject detection across varying viewpoints, camera parameters and environment\nconditions. Notably, our model-agnostic approach preserves the original network\nstructure without incurring additional inference costs, facilitating seamless\nintegration across various models and simplifying deployment. Furthermore, we\nalso show our approach achieves satisfactory results in real data when trained\nonly with virtual datasets, eliminating the need for real scene annotations.\nExperimental results on both Domain Generalization (DG) and Unsupervised Domain\nAdaptation (UDA) clearly demonstrate its effectiveness. Our code will be\nreleased.\n","authors":["Hao Lu","Yunpeng Zhang","Qing Lian","Dalong Du","Yingcong Chen"],"pdf_url":"https://arxiv.org/pdf/2310.11346v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11341v1","updated":"2023-10-17T15:24:02Z","published":"2023-10-17T15:24:02Z","title":"Dual Cognitive Architecture: Incorporating Biases and Multi-Memory\n Systems for Lifelong Learning","summary":" Artificial neural networks (ANNs) exhibit a narrow scope of expertise on\nstationary independent data. However, the data in the real world is continuous\nand dynamic, and ANNs must adapt to novel scenarios while also retaining the\nlearned knowledge to become lifelong learners. The ability of humans to excel\nat these tasks can be attributed to multiple factors ranging from cognitive\ncomputational structures, cognitive biases, and the multi-memory systems in the\nbrain. We incorporate key concepts from each of these to design a novel\nframework, Dual Cognitive Architecture (DUCA), which includes multiple\nsub-systems, implicit and explicit knowledge representation dichotomy,\ninductive bias, and a multi-memory system. The inductive bias learner within\nDUCA is instrumental in encoding shape information, effectively countering the\ntendency of ANNs to learn local textures. Simultaneously, the inclusion of a\nsemantic memory submodule facilitates the gradual consolidation of knowledge,\nreplicating the dynamics observed in fast and slow learning systems,\nreminiscent of the principles underpinning the complementary learning system in\nhuman cognition. DUCA shows improvement across different settings and datasets,\nand it also exhibits reduced task recency bias, without the need for extra\ninformation. To further test the versatility of lifelong learning methods on a\nchallenging distribution shift, we introduce a novel domain-incremental dataset\nDN4IL. 
In addition to improving performance on existing benchmarks, DUCA also\ndemonstrates superior performance on this complex dataset.\n","authors":["Shruthi Gowda","Bahram Zonooz","Elahe Arani"],"pdf_url":"https://arxiv.org/pdf/2310.11341v1.pdf","comment":"Published in Transactions on Machine Learning Research (TMLR)"},{"id":"http://arxiv.org/abs/2306.06081v3","updated":"2023-10-17T15:20:47Z","published":"2023-05-25T09:04:31Z","title":"CARSO: Blending Adversarial Training and Purification Improves\n Adversarial Robustness","summary":" In this work, we propose a novel adversarial defence mechanism for image\nclassification - CARSO - blending the paradigms of adversarial training and\nadversarial purification in a mutually-beneficial, robustness-enhancing way.\nThe method builds upon an adversarially-trained classifier, and learns to map\nits internal representation associated with a potentially perturbed input onto\na distribution of tentative clean reconstructions. Multiple samples from such\ndistribution are classified by the adversarially-trained model itself, and an\naggregation of its outputs finally constitutes the robust prediction of\ninterest. Experimental evaluation by a well-established benchmark of varied,\nstrong adaptive attacks, across different image datasets and classifier\narchitectures, shows that CARSO is able to defend itself against foreseen and\nunforeseen threats, including adaptive end-to-end attacks devised for\nstochastic defences. Paying a tolerable clean accuracy toll, our method\nimproves by a significant margin the state of the art for CIFAR-10 and\nCIFAR-100 $\\ell_\\infty$ robust classification accuracy against AutoAttack. Code\nand pre-trained models are available at https://github.com/emaballarin/CARSO .\n","authors":["Emanuele Ballarin","Alessio Ansuini","Luca Bortolussi"],"pdf_url":"https://arxiv.org/pdf/2306.06081v3.pdf","comment":"19 pages, 1 figure, 9 tables"},{"id":"http://arxiv.org/abs/2310.11333v1","updated":"2023-10-17T15:12:11Z","published":"2023-10-17T15:12:11Z","title":"Key Point-based Orientation Estimation of Strawberries for Robotic Fruit\n Picking","summary":" Selective robotic harvesting is a promising technological solution to address\nlabour shortages which are affecting modern agriculture in many parts of the\nworld. For an accurate and efficient picking process, a robotic harvester\nrequires the precise location and orientation of the fruit to effectively plan\nthe trajectory of the end effector. The current methods for estimating fruit\norientation employ either complete 3D information which typically requires\nregistration from multiple views or rely on fully-supervised learning\ntechniques, which require difficult-to-obtain manual annotation of the\nreference orientation. In this paper, we introduce a novel key-point-based\nfruit orientation estimation method allowing for the prediction of 3D\norientation from 2D images directly. The proposed technique can work without\nfull 3D orientation annotations but can also exploit such information for\nimproved accuracy. We evaluate our work on two separate datasets of strawberry\nimages obtained from real-world data collection scenarios. Our proposed method\nachieves state-of-the-art performance with an average error as low as\n$8^{\\circ}$, improving predictions by $\\sim30\\%$ compared to previous work\npresented in~\\cite{wagner2021efficient}. 
Furthermore, our method is suited for\nreal-time robotic applications with fast inference times of $\\sim30$ms.\n","authors":["Justin Le Louëdec","Grzegorz Cielniak"],"pdf_url":"https://arxiv.org/pdf/2310.11333v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11320v1","updated":"2023-10-17T14:58:18Z","published":"2023-10-17T14:58:18Z","title":"Towards Generic Semi-Supervised Framework for Volumetric Medical Image\n Segmentation","summary":" Volume-wise labeling in 3D medical images is a time-consuming task that\nrequires expertise. As a result, there is growing interest in using\nsemi-supervised learning (SSL) techniques to train models with limited labeled\ndata. However, the challenges and practical applications extend beyond SSL to\nsettings such as unsupervised domain adaptation (UDA) and semi-supervised\ndomain generalization (SemiDG). This work aims to develop a generic SSL\nframework that can handle all three settings. We identify two main obstacles to\nachieving this goal in the existing SSL framework: 1) the weakness of capturing\ndistribution-invariant features; and 2) the tendency for unlabeled data to be\noverwhelmed by labeled data, leading to over-fitting to the labeled data during\ntraining. To address these issues, we propose an Aggregating & Decoupling\nframework. The aggregating part consists of a Diffusion encoder that constructs\na common knowledge set by extracting distribution-invariant features from\naggregated information from multiple distributions/domains. The decoupling part\nconsists of three decoders that decouple the training process with labeled and\nunlabeled data, thus avoiding over-fitting to labeled data, specific domains\nand classes. We evaluate our proposed framework on four benchmark datasets for\nSSL, Class-imbalanced SSL, UDA and SemiDG. The results showcase notable\nimprovements compared to state-of-the-art methods across all four settings,\nindicating the potential of our framework to tackle more challenging SSL\nscenarios. Code and models are available at:\nhttps://github.com/xmed-lab/GenericSSL.\n","authors":["Haonan Wang","Xiaomeng Li"],"pdf_url":"https://arxiv.org/pdf/2310.11320v1.pdf","comment":"Accepted at NeurIPS 2023"},{"id":"http://arxiv.org/abs/2207.07173v3","updated":"2023-10-17T14:52:13Z","published":"2022-07-14T19:16:56Z","title":"Deep Image Clustering with Contrastive Learning and Multi-scale Graph\n Convolutional Networks","summary":" Deep clustering has shown its promising capability in joint representation\nlearning and clustering via deep neural networks. Despite the significant\nprogress, the existing deep clustering works mostly utilize some\ndistribution-based clustering loss, lacking the ability to unify representation\nlearning and multi-scale structure learning. To address this, this paper\npresents a new deep clustering approach termed image clustering with\ncontrastive learning and multi-scale graph convolutional networks (IcicleGCN),\nwhich bridges the gap between convolutional neural network (CNN) and graph\nconvolutional network (GCN) as well as the gap between contrastive learning and\nmulti-scale structure learning for the deep clustering task. 
Our framework\nconsists of four main modules, namely, the CNN-based backbone, the Instance\nSimilarity Module (ISM), the Joint Cluster Structure Learning and Instance\nreconstruction Module (JC-SLIM), and the Multi-scale GCN module (M-GCN).\nSpecifically, the backbone network with two weight-sharing views is utilized to\nlearn the representations for the two augmented samples (from each image). The\nlearned representations are then fed to ISM and JC-SLIM for joint\ninstance-level and cluster-level contrastive learning, respectively, during\nwhich an auto-encoder in JC-SLIM is also pretrained to serve as a bridge to the\nM-GCN module. Further, to enforce multi-scale neighborhood structure learning,\ntwo streams of GCNs and the auto-encoder are simultaneously trained via (i) the\nlayer-wise interaction with representation fusion and (ii) the joint\nself-adaptive learning. Experiments on multiple image datasets demonstrate the\nsuperior clustering performance of IcicleGCN over the state-of-the-art. The\ncode is available at https://github.com/xuyuankun631/IcicleGCN.\n","authors":["Yuankun Xu","Dong Huang","Chang-Dong Wang","Jian-Huang Lai"],"pdf_url":"https://arxiv.org/pdf/2207.07173v3.pdf","comment":"To appear in the Pattern Recognition journal"},{"id":"http://arxiv.org/abs/2310.11316v1","updated":"2023-10-17T14:48:02Z","published":"2023-10-17T14:48:02Z","title":"MonoSKD: General Distillation Framework for Monocular 3D Object\n Detection via Spearman Correlation Coefficient","summary":" Monocular 3D object detection is an inherently ill-posed problem, as it is\nchallenging to predict accurate 3D localization from a single image. Existing\nmonocular 3D detection knowledge distillation methods usually project the LiDAR\nonto the image plane and train the teacher network accordingly. Transferring\nLiDAR-based model knowledge to RGB-based models is more complex, so a general\ndistillation strategy is needed. To alleviate cross-modal prob-lem, we propose\nMonoSKD, a novel Knowledge Distillation framework for Monocular 3D detection\nbased on Spearman correlation coefficient, to learn the relative correlation\nbetween cross-modal features. Considering the large gap between these features,\nstrict alignment of features may mislead the training, so we propose a looser\nSpearman loss. Furthermore, by selecting appropriate distillation locations and\nremoving redundant modules, our scheme saves more GPU resources and trains\nfaster than existing methods. Extensive experiments are performed to verify the\neffectiveness of our framework on the challenging KITTI 3D object detection\nbenchmark. Our method achieves state-of-the-art performance until submission\nwith no additional inference computational cost. Our codes are available at\nhttps://github.com/Senwang98/MonoSKD\n","authors":["Sen Wang","Jin Zheng"],"pdf_url":"https://arxiv.org/pdf/2310.11316v1.pdf","comment":"Accepted by ECAI 2023"},{"id":"http://arxiv.org/abs/2310.11307v1","updated":"2023-10-17T14:32:49Z","published":"2023-10-17T14:32:49Z","title":"Multi Self-supervised Pre-fine-tuned Transformer Fusion for Better\n Intelligent Transportation Detection","summary":" Intelligent transportation system combines advanced information technology to\nprovide intelligent services such as monitoring, detection, and early warning\nfor modern transportation. Intelligent transportation detection is the\ncornerstone of many intelligent traffic services by identifying task targets\nthrough object detection methods. 
However existing detection methods in\nintelligent transportation are limited by two aspects. First, there is a\ndifference between the model knowledge pre-trained on large-scale datasets and\nthe knowledge required for target task. Second, most detection models follow\nthe pattern of single-source learning, which limits the learning ability. To\naddress these problems, we propose a Multi Self-supervised Pre-fine-tuned\nTransformer Fusion (MSPTF) network, consisting of two steps: unsupervised\npre-fine-tune domain knowledge learning and multi-model fusion target task\nlearning. In the first step, we introduced self-supervised learning methods\ninto transformer model pre-fine-tune which could reduce data costs and\nalleviate the knowledge gap between pre-trained model and target task. In the\nsecond step, we take feature information differences between different model\narchitectures and different pre-fine-tune tasks into account and propose\nMulti-model Semantic Consistency Cross-attention Fusion (MSCCF) network to\ncombine different transformer model features by considering channel semantic\nconsistency and feature vector semantic consistency, which obtain more complete\nand proper fusion features for detection task. We experimented the proposed\nmethod on vehicle recognition dataset and road disease detection dataset and\nachieved 1.1%, 5.5%, 4.2% improvement compared with baseline and 0.7%, 1.8%,\n1.7% compared with sota, which proved the effectiveness of our method.\n","authors":["Juwu Zheng","Jiangtao Ren"],"pdf_url":"https://arxiv.org/pdf/2310.11307v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.11713v5","updated":"2023-10-17T14:19:13Z","published":"2023-02-23T00:33:54Z","title":"Can Pre-trained Vision and Language Models Answer Visual\n Information-Seeking Questions?","summary":" Pre-trained vision and language models have demonstrated state-of-the-art\ncapabilities over existing tasks involving images and texts, including visual\nquestion answering. However, it remains unclear whether these models possess\nthe capability to answer questions that are not only querying visual content\nbut knowledge-intensive and information-seeking. In this study, we introduce\nInfoSeek, a visual question answering dataset tailored for information-seeking\nquestions that cannot be answered with only common sense knowledge. Using\nInfoSeek, we analyze various pre-trained visual question answering models and\ngain insights into their characteristics. Our findings reveal that\nstate-of-the-art pre-trained multi-modal models (e.g., PaLI-X, BLIP2, etc.)\nface challenges in answering visual information-seeking questions, but\nfine-tuning on the InfoSeek dataset elicits models to use fine-grained\nknowledge that was learned during their pre-training. 
Furthermore, we show that\naccurate visual entity recognition can be used to improve performance on\nInfoSeek by retrieving relevant documents, showing a significant space for\nimprovement.\n","authors":["Yang Chen","Hexiang Hu","Yi Luan","Haitian Sun","Soravit Changpinyo","Alan Ritter","Ming-Wei Chang"],"pdf_url":"https://arxiv.org/pdf/2302.11713v5.pdf","comment":"EMNLP 2023 (main conference); Our dataset and evaluation is available\n at https://open-vision-language.github.io/infoseek/"},{"id":"http://arxiv.org/abs/2310.11295v1","updated":"2023-10-17T14:16:42Z","published":"2023-10-17T14:16:42Z","title":"CorrTalk: Correlation Between Hierarchical Speech and Facial Activity\n Variances for 3D Animation","summary":" Speech-driven 3D facial animation is a challenging cross-modal task that has\nattracted growing research interest. During speaking activities, the mouth\ndisplays strong motions, while the other facial regions typically demonstrate\ncomparatively weak activity levels. Existing approaches often simplify the\nprocess by directly mapping single-level speech features to the entire facial\nanimation, which overlook the differences in facial activity intensity leading\nto overly smoothed facial movements. In this study, we propose a novel\nframework, CorrTalk, which effectively establishes the temporal correlation\nbetween hierarchical speech features and facial activities of different\nintensities across distinct regions. A novel facial activity intensity metric\nis defined to distinguish between strong and weak facial activity, obtained by\ncomputing the short-time Fourier transform of facial vertex displacements.\nBased on the variances in facial activity, we propose a dual-branch decoding\nframework to synchronously synthesize strong and weak facial activity, which\nguarantees wider intensity facial animation synthesis. Furthermore, a weighted\nhierarchical feature encoder is proposed to establish temporal correlation\nbetween hierarchical speech features and facial activity at different\nintensities, which ensures lip-sync and plausible facial expressions. Extensive\nqualitatively and quantitatively experiments as well as a user study indicate\nthat our CorrTalk outperforms existing state-of-the-art methods. The source\ncode and supplementary video are publicly available at:\nhttps://zjchu.github.io/projects/CorrTalk/\n","authors":["Zhaojie Chu","Kailing Guo","Xiaofen Xing","Yilin Lan","Bolun Cai","Xiangmin Xu"],"pdf_url":"https://arxiv.org/pdf/2310.11295v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11284v1","updated":"2023-10-17T14:06:55Z","published":"2023-10-17T14:06:55Z","title":"Self-Supervised 3D Scene Flow Estimation and Motion Prediction using\n Local Rigidity Prior","summary":" In this article, we investigate self-supervised 3D scene flow estimation and\nclass-agnostic motion prediction on point clouds. A realistic scene can be well\nmodeled as a collection of rigidly moving parts, therefore its scene flow can\nbe represented as a combination of the rigid motion of these individual parts.\nBuilding upon this observation, we propose to generate pseudo scene flow labels\nfor self-supervised learning through piecewise rigid motion estimation, in\nwhich the source point cloud is decomposed into local regions and each region\nis treated as rigid. By rigidly aligning each region with its potential\ncounterpart in the target point cloud, we obtain a region-specific rigid\ntransformation to generate its pseudo flow labels. 
To mitigate the impact of\npotential outliers on label generation, when solving the rigid registration for\neach region, we alternately perform three steps: establishing point\ncorrespondences, measuring the confidence for the correspondences, and updating\nthe rigid transformation based on the correspondences and their confidence. As\na result, confident correspondences will dominate label generation and a\nvalidity mask will be derived for the generated pseudo labels. By using the\npseudo labels together with their validity mask for supervision, models can be\ntrained in a self-supervised manner. Extensive experiments on FlyingThings3D\nand KITTI datasets demonstrate that our method achieves new state-of-the-art\nperformance in self-supervised scene flow learning, without any ground truth\nscene flow for supervision, even performing better than some supervised\ncounterparts. Additionally, our method is further extended to class-agnostic\nmotion prediction and significantly outperforms previous state-of-the-art\nself-supervised methods on nuScenes dataset.\n","authors":["Ruibo Li","Chi Zhang","Zhe Wang","Chunhua Shen","Guosheng Lin"],"pdf_url":"https://arxiv.org/pdf/2310.11284v1.pdf","comment":"An extension of our CVPR 2022 paper (RigidFlow: Self-Supervised Scene\n Flow Learning on Point Clouds by Local Rigidity Prior)"},{"id":"http://arxiv.org/abs/2310.11276v1","updated":"2023-10-17T13:55:43Z","published":"2023-10-17T13:55:43Z","title":"Video Super-Resolution Using a Grouped Residual in Residual Network","summary":" Super-resolution (SR) is the technique of increasing the nominal resolution\nof image / video content accompanied with quality improvement. Video\nsuper-resolution (VSR) can be considered as the generalization of single image\nsuper-resolution (SISR). This generalization should be such that more detail is\ncreated in the output using adjacent input frames. In this paper, we propose a\ngrouped residual in residual network (GRRN) for VSR. By adjusting the\nhyperparameters of the proposed structure, we train three networks with\ndifferent numbers of parameters and compare their quantitative and qualitative\nresults with the existing methods. Although based on some quantitative\ncriteria, GRRN does not provide better results than the existing methods, in\nterms of the quality of the output image it has acceptable performance.\n","authors":["MohammadHossein Ashoori","Arash Amini"],"pdf_url":"https://arxiv.org/pdf/2310.11276v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.03737v3","updated":"2023-10-17T13:43:34Z","published":"2022-09-07T13:08:38Z","title":"CGAN-ECT: Tomography Image Reconstruction from Electrical Capacitance\n Measurements Using CGANs","summary":" Due to the rapid growth of Electrical Capacitance Tomography (ECT)\napplications in several industrial fields, there is a crucial need for\ndeveloping high quality, yet fast, methodologies of image reconstruction from\nraw capacitance measurements. Deep learning, as an effective non-linear mapping\ntool for complicated functions, has been going viral in many fields including\nelectrical tomography. In this paper, we propose a Conditional Generative\nAdversarial Network (CGAN) model for reconstructing ECT images from capacitance\nmeasurements. The initial image of the CGAN model is constructed from the\ncapacitance measurement. To our knowledge, this is the first time to represent\nthe capacitance measurements in an image form. 
We have created a new massive\nECT dataset of 320K synthetic image measurements pairs for training, and\ntesting the proposed model. The feasibility and generalization ability of the\nproposed CGAN-ECT model are evaluated using testing dataset, contaminated data\nand flow patterns that are not exposed to the model during the training phase.\nThe evaluation results prove that the proposed CGAN-ECT model can efficiently\ncreate more accurate ECT images than traditional and other deep learning-based\nimage reconstruction algorithms. CGAN-ECT achieved an average image correlation\ncoefficient of more than 99.3% and an average relative image error about 0.07.\n","authors":["Wael Deabes","Alaa E. Abdel-Hakim"],"pdf_url":"https://arxiv.org/pdf/2209.03737v3.pdf","comment":"13 pages, 10 figures, 6 tables"},{"id":"http://arxiv.org/abs/2310.11265v1","updated":"2023-10-17T13:38:38Z","published":"2023-10-17T13:38:38Z","title":"Image Compression using only Attention based Neural Networks","summary":" In recent research, Learned Image Compression has gained prominence for its\ncapacity to outperform traditional handcrafted pipelines, especially at low\nbit-rates. While existing methods incorporate convolutional priors with\noccasional attention blocks to address long-range dependencies, recent advances\nin computer vision advocate for a transformative shift towards fully\ntransformer-based architectures grounded in the attention mechanism. This paper\ninvestigates the feasibility of image compression exclusively using attention\nlayers within our novel model, QPressFormer. We introduce the concept of\nlearned image queries to aggregate patch information via cross-attention,\nfollowed by quantization and coding techniques. Through extensive evaluations,\nour work demonstrates competitive performance achieved by convolution-free\narchitectures across the popular Kodak, DIV2K, and CLIC datasets.\n","authors":["Natacha Luka","Romain Negrel","David Picard"],"pdf_url":"https://arxiv.org/pdf/2310.11265v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11257v1","updated":"2023-10-17T13:22:59Z","published":"2023-10-17T13:22:59Z","title":"An empirical study of automatic wildlife detection using drone thermal\n imaging and object detection","summary":" Artificial intelligence has the potential to make valuable contributions to\nwildlife management through cost-effective methods for the collection and\ninterpretation of wildlife data. Recent advances in remotely piloted aircraft\nsystems (RPAS or ``drones'') and thermal imaging technology have created new\napproaches to collect wildlife data. These emerging technologies could provide\npromising alternatives to standard labourious field techniques as well as cover\nmuch larger areas. In this study, we conduct a comprehensive review and\nempirical study of drone-based wildlife detection. Specifically, we collect a\nrealistic dataset of drone-derived wildlife thermal detections. Wildlife\ndetections, including arboreal (for instance, koalas, phascolarctos cinereus)\nand ground dwelling species in our collected data are annotated via bounding\nboxes by experts. We then benchmark state-of-the-art object detection\nalgorithms on our collected dataset. 
We use these experimental results to\nidentify issues and discuss future directions in automatic animal monitoring\nusing drones.\n","authors":["Miao Chang","Tan Vuong","Manas Palaparthi","Lachlan Howell","Alessio Bonti","Mohamed Abdelrazek","Duc Thanh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2310.11257v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11256v1","updated":"2023-10-17T13:22:36Z","published":"2023-10-17T13:22:36Z","title":"Gromov-Wassertein-like Distances in the Gaussian Mixture Models Space","summary":" In this paper, we introduce two Gromov-Wasserstein-type distances on the set\nof Gaussian mixture models. The first one takes the form of a\nGromov-Wasserstein distance between two discrete distributionson the space of\nGaussian measures. This distance can be used as an alternative to\nGromov-Wasserstein for applications which only require to evaluate how far the\ndistributions are from each other but does not allow to derive directly an\noptimal transportation plan between clouds of points. To design a way to define\nsuch a transportation plan, we introduce another distance between measures\nliving in incomparable spaces that turns out to be closely related to\nGromov-Wasserstein. When restricting the set of admissible transportation\ncouplings to be themselves Gaussian mixture models in this latter, this defines\nanother distance between Gaussian mixture models that can be used as another\nalternative to Gromov-Wasserstein and which allows to derive an optimal\nassignment between points. Finally, we design a transportation plan associated\nwith the first distance by analogy with the second, and we illustrate their\npractical uses on medium-to-large scale problems such as shape matching and\nhyperspectral image color transfer.\n","authors":["Antoine Salmona","Julie Delon","Agnès Desolneux"],"pdf_url":"https://arxiv.org/pdf/2310.11256v1.pdf","comment":"preprint"},{"id":"http://arxiv.org/abs/2212.10230v3","updated":"2023-10-17T13:14:03Z","published":"2022-12-20T13:09:58Z","title":"A Comprehensive Study of the Robustness for LiDAR-based 3D Object\n Detectors against Adversarial Attacks","summary":" Recent years have witnessed significant advancements in deep learning-based\n3D object detection, leading to its widespread adoption in numerous\napplications. As 3D object detectors become increasingly crucial for\nsecurity-critical tasks, it is imperative to understand their robustness\nagainst adversarial attacks. This paper presents the first comprehensive\nevaluation and analysis of the robustness of LiDAR-based 3D detectors under\nadversarial attacks. Specifically, we extend three distinct adversarial attacks\nto the 3D object detection task, benchmarking the robustness of\nstate-of-the-art LiDAR-based 3D object detectors against attacks on the KITTI\nand Waymo datasets. We further analyze the relationship between robustness and\ndetector properties. Additionally, we explore the transferability of\ncross-model, cross-task, and cross-data attacks. Thorough experiments on\ndefensive strategies for 3D detectors are conducted, demonstrating that simple\ntransformations like flipping provide little help in improving robustness when\nthe applied transformation strategy is exposed to attackers. 
\\revise{Finally,\nwe propose balanced adversarial focal training, based on conventional\nadversarial training, to strike a balance between accuracy and robustness.} Our\nfindings will facilitate investigations into understanding and defending\nagainst adversarial attacks on LiDAR-based 3D object detectors, thus advancing\nthe field. The source code is publicly available at\n\\url{https://github.com/Eaphan/Robust3DOD}.\n","authors":["Yifan Zhang","Junhui Hou","Yixuan Yuan"],"pdf_url":"https://arxiv.org/pdf/2212.10230v3.pdf","comment":"30 pages, 14 figures. Accepted by IJCV"},{"id":"http://arxiv.org/abs/2310.11239v1","updated":"2023-10-17T13:08:24Z","published":"2023-10-17T13:08:24Z","title":"LiDAR-based 4D Occupancy Completion and Forecasting","summary":" Scene completion and forecasting are two popular perception problems in\nresearch for mobile agents like autonomous vehicles. Existing approaches treat\nthe two problems in isolation, resulting in a separate perception of the two\naspects. In this paper, we introduce a novel LiDAR perception task of Occupancy\nCompletion and Forecasting (OCF) in the context of autonomous driving to unify\nthese aspects into a cohesive framework. This task requires new algorithms to\naddress three challenges altogether: (1) sparse-to-dense reconstruction, (2)\npartial-to-complete hallucination, and (3) 3D-to-4D prediction. To enable\nsupervision and evaluation, we curate a large-scale dataset termed OCFBench\nfrom public autonomous driving datasets. We analyze the performance of closely\nrelated existing baseline models and our own ones on our dataset. We envision\nthat this research will inspire and call for further investigation in this\nevolving and crucial area of 4D perception. Our code for data curation and\nbaseline implementation is available at https://github.com/ai4ce/Occ4cast.\n","authors":["Xinhao Liu","Moonjun Gong","Qi Fang","Haoyu Xie","Yiming Li","Hang Zhao","Chen Feng"],"pdf_url":"https://arxiv.org/pdf/2310.11239v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.00600v2","updated":"2023-10-17T12:48:58Z","published":"2023-06-01T12:16:26Z","title":"Rotating Features for Object Discovery","summary":" The binding problem in human cognition, concerning how the brain represents\nand connects objects within a fixed network of neural connections, remains a\nsubject of intense debate. Most machine learning efforts addressing this issue\nin an unsupervised setting have focused on slot-based methods, which may be\nlimiting due to their discrete nature and difficulty to express uncertainty.\nRecently, the Complex AutoEncoder was proposed as an alternative that learns\ncontinuous and distributed object-centric representations. However, it is only\napplicable to simple toy data. In this paper, we present Rotating Features, a\ngeneralization of complex-valued features to higher dimensions, and a new\nevaluation procedure for extracting objects from distributed representations.\nAdditionally, we show the applicability of our approach to pre-trained\nfeatures. Together, these advancements enable us to scale distributed\nobject-centric representations from simple toy to real-world data. 
We believe\nthis work advances a new paradigm for addressing the binding problem in machine\nlearning and has the potential to inspire further innovation in the field.\n","authors":["Sindy Löwe","Phillip Lippe","Francesco Locatello","Max Welling"],"pdf_url":"https://arxiv.org/pdf/2306.00600v2.pdf","comment":"Oral presentation at NeurIPS 2023"},{"id":"http://arxiv.org/abs/2310.11217v1","updated":"2023-10-17T12:45:04Z","published":"2023-10-17T12:45:04Z","title":"Innovative Methods for Non-Destructive Inspection of Handwritten\n Documents","summary":" Handwritten document analysis is an area of forensic science, with the goal\nof establishing authorship of documents through examination of inherent\ncharacteristics. Law enforcement agencies use standard protocols based on\nmanual processing of handwritten documents. This method is time-consuming, is\noften subjective in its evaluation, and is not replicable. To overcome these\nlimitations, in this paper we present a framework capable of extracting and\nanalyzing intrinsic measures of manuscript documents related to text line\nheights, space between words, and character sizes using image processing and\ndeep learning techniques. The final feature vector for each document involved\nconsists of the mean and standard deviation for every type of measure\ncollected. By quantifying the Euclidean distance between the feature vectors of\nthe documents to be compared, authorship can be discerned. We also proposed a\nnew and challenging dataset consisting of 362 handwritten manuscripts written\non paper and digital devices by 124 different people. Our study pioneered the\ncomparison between traditionally handwritten documents and those produced with\ndigital tools (e.g., tablets). Experimental results demonstrate the ability of\nour method to objectively determine authorship in different writing media,\noutperforming the state of the art.\n","authors":["Eleonora Breci","Luca Guarnera","Sebastiano Battiato"],"pdf_url":"https://arxiv.org/pdf/2310.11217v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11210v1","updated":"2023-10-17T12:39:16Z","published":"2023-10-17T12:39:16Z","title":"Learning Comprehensive Representations with Richer Self for\n Text-to-Image Person Re-Identification","summary":" Text-to-image person re-identification (TIReID) retrieves pedestrian images\nof the same identity based on a query text. However, existing methods for\nTIReID typically treat it as a one-to-one image-text matching problem, only\nfocusing on the relationship between image-text pairs within a view. The\nmany-to-many matching between image-text pairs across views under the same\nidentity is not taken into account, which is one of the main reasons for the\npoor performance of existing methods. To this end, we propose a simple yet\neffective framework, called LCR$^2$S, for modeling many-to-many correspondences\nof the same identity by learning comprehensive representations for both\nmodalities from a novel perspective. We construct a support set for each image\n(text) by using other images (texts) under the same identity and design a\nmulti-head attentional fusion module to fuse the image (text) and its support\nset. The resulting enriched image and text features fuse information from\nmultiple views, which are aligned to train a \"richer\" TIReID model with\nmany-to-many correspondences. 
Since the support set is unavailable during\ninference, we propose to distill the knowledge learned by the \"richer\" model\ninto a lightweight model for inference with a single image/text as input. The\nlightweight model focuses on semantic association and reasoning of multi-view\ninformation, which can generate a comprehensive representation containing\nmulti-view information with only a single-view input to perform accurate\ntext-to-image retrieval during inference. In particular, we use the intra-modal\nfeatures and inter-modal semantic relations of the \"richer\" model to supervise\nthe lightweight model to inherit its powerful capability. Extensive experiments\ndemonstrate the effectiveness of LCR$^2$S, and it also achieves new\nstate-of-the-art performance on three popular TIReID datasets.\n","authors":["Shuanglin Yan","Neng Dong","Jun Liu","Liyan Zhang","Jinhui Tang"],"pdf_url":"https://arxiv.org/pdf/2310.11210v1.pdf","comment":"Accepted by ACM MM 2023"},{"id":"http://arxiv.org/abs/2310.11204v1","updated":"2023-10-17T12:30:46Z","published":"2023-10-17T12:30:46Z","title":"Improving Video Deepfake Detection: A DCT-Based Approach with\n Patch-Level Analysis","summary":" The term deepfake refers to all those multimedia contents that were\nsynthetically altered or created from scratch through the use of generative\nmodels. This phenomenon has become widespread due to the use of increasingly\naccurate and efficient architectures capable of rendering manipulated content\nindistinguishable from real content. In order to fight the illicit use of this\npowerful technology, it has become necessary to develop algorithms able to\ndistinguish synthetic content from real ones. In this study, a new algorithm\nfor the detection of deepfakes in digital videos is presented, focusing on the\nmain goal of creating a fast and explainable method from a forensic\nperspective. To achieve this goal, the I-frames were extracted in order to\nprovide faster computation and analysis than approaches described in\nliterature. In addition, to identify the most discriminating regions within\nindividual video frames, the entire frame, background, face, eyes, nose, mouth,\nand face frame were analyzed separately. From the Discrete Cosine Transform\n(DCT), the Beta components were extracted from the AC coefficients and used as\ninput to standard classifiers (e.g., k-NN, SVM, and others) in order to\nidentify those frequencies most discriminative for solving the task in\nquestion. Experimental results obtained on the Faceforensics++ and Celeb-DF\n(v2) datasets show that the eye and mouth regions are those most discriminative\nand able to determine the nature of the video with greater reliability than the\nanalysis of the whole frame. 
The method proposed in this study is analytical,\nfast and does not require much computational power.\n","authors":["Luca Guarnera","Salvatore Manganello","Sebastiano Battiato"],"pdf_url":"https://arxiv.org/pdf/2310.11204v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.13785v2","updated":"2023-10-17T12:24:24Z","published":"2023-04-26T19:05:34Z","title":"Customized Segment Anything Model for Medical Image Segmentation","summary":" We propose SAMed, a general solution for medical image segmentation.\nDifferent from the previous methods, SAMed is built upon the large-scale image\nsegmentation model, Segment Anything Model (SAM), to explore the new research\nparadigm of customizing large-scale models for medical image segmentation.\nSAMed applies the low-rank-based (LoRA) finetuning strategy to the SAM image\nencoder and finetunes it together with the prompt encoder and the mask decoder\non labeled medical image segmentation datasets. We also observe the warmup\nfinetuning strategy and the AdamW optimizer lead SAMed to successful\nconvergence and lower loss. Different from SAM, SAMed could perform semantic\nsegmentation on medical images. Our trained SAMed model achieves 81.88 DSC and\n20.64 HD on the Synapse multi-organ segmentation dataset, which is on par with\nthe state-of-the-art methods. We conduct extensive experiments to validate the\neffectiveness of our design. Since SAMed only updates a small fraction of the\nSAM parameters, its deployment cost and storage cost are quite marginal in\npractical usage. The code of SAMed is available at\nhttps://github.com/hitachinsk/SAMed.\n","authors":["Kaidong Zhang","Dong Liu"],"pdf_url":"https://arxiv.org/pdf/2304.13785v2.pdf","comment":"Technical report, 14 pages"},{"id":"http://arxiv.org/abs/2305.11490v4","updated":"2023-10-17T12:16:03Z","published":"2023-05-19T07:44:39Z","title":"LLM-CXR: Instruction-Finetuned LLM for CXR Image Understanding and\n Generation","summary":" Following the impressive development of LLMs, vision-language alignment in\nLLMs is actively being researched to enable multimodal reasoning and visual IO.\nThis direction of research is particularly relevant to medical imaging because\nmedical image analysis and generation consist of reasoning based on a\ncombination of visual features and prior knowledge. Many recent works have\nfocused on training adapter networks that serve as an information bridge\nbetween image processing networks and LLMs; but presumably, in order to achieve\nmaximum reasoning potential of LLMs on visual information as well, visual and\nlanguage features should be allowed to interact more freely. This is especially\nimportant in the medical domain because understanding and generating medical\nimages such as chest X-rays (CXR) require not only accurate visual and\nlanguage-based reasoning but also a more intimate mapping between the two\nmodalities. 
Thus, taking inspiration from previous work on the transformer and\nVQ-GAN combination for bidirectional image and text generation, we build upon\nthis approach and develop a method for instruction-tuning an LLM pre-trained\nonly on text to gain vision-language capabilities for medical images.\nSpecifically, we leverage a pretrained LLM's existing question-answering and\ninstruction-following abilities to teach it to understand visual inputs by\ninstructing it to answer questions about image inputs and, symmetrically,\noutput both text and image responses appropriate to a given query by tuning the\nLLM with diverse tasks that encompass image-based text-generation and\ntext-based image-generation. We show that our model, LLM-CXR, trained in this\napproach shows better image-text alignment in both CXR understanding and\ngeneration tasks while being smaller in size compared to previously developed\nmodels that perform a narrower range of tasks. The code is at\nhttps://github.com/hyn2028/llm-cxr.\n","authors":["Suhyeon Lee","Won Jun Kim","Jinho Chang","Jong Chul Ye"],"pdf_url":"https://arxiv.org/pdf/2305.11490v4.pdf","comment":"20 pages, 8 figures"},{"id":"http://arxiv.org/abs/2310.11184v1","updated":"2023-10-17T12:01:32Z","published":"2023-10-17T12:01:32Z","title":"Sparse Multi-Object Render-and-Compare","summary":" Reconstructing 3D shape and pose of static objects from a single image is an\nessential task for various industries, including robotics, augmented reality,\nand digital content creation. This can be done by directly predicting 3D shape\nin various representations or by retrieving CAD models from a database and\npredicting their alignments. Directly predicting 3D shapes often produces\nunrealistic, overly smoothed or tessellated shapes. Retrieving CAD models\nensures realistic shapes but requires robust and accurate alignment. Learning\nto directly predict CAD model poses from image features is challenging and\ninaccurate. Works, such as ROCA, compute poses from predicted normalised object\ncoordinates which can be more accurate but are susceptible to systematic\nfailure. SPARC demonstrates that following a ''render-and-compare'' approach\nwhere a network iteratively improves upon its own predictions achieves accurate\nalignments. Nevertheless, it performs individual CAD alignment for every object\ndetected in an image. This approach is slow when applied to many objects as the\ntime complexity increases linearly with the number of objects and can not learn\ninter-object relations. Introducing a new network architecture Multi-SPARC we\nlearn to perform CAD model alignments for multiple detected objects jointly.\nCompared to other single-view methods we achieve state-of-the-art performance\non the challenging real-world dataset ScanNet. By improving the instance\nalignment accuracy from 31.8% to 40.3% we perform similar to state-of-the-art\nmulti-view methods.\n","authors":["Florian Langer","Ignas Budvytis","Roberto Cipolla"],"pdf_url":"https://arxiv.org/pdf/2310.11184v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11178v1","updated":"2023-10-17T11:53:32Z","published":"2023-10-17T11:53:32Z","title":"FocDepthFormer: Transformer with LSTM for Depth Estimation from Focus","summary":" Depth estimation from focal stacks is a fundamental computer vision problem\nthat aims to infer depth from focus/defocus cues in the image stacks. 
Most\nexisting methods tackle this problem by applying convolutional neural networks\n(CNNs) with 2D or 3D convolutions over a set of fixed stack images to learn\nfeatures across images and stacks. Their performance is restricted due to the\nlocal properties of the CNNs, and they are constrained to process a fixed\nnumber of stacks consistent in train and inference, limiting the generalization\nto the arbitrary length of stacks. To handle the above limitations, we develop\na novel Transformer-based network, FocDepthFormer, composed mainly of a\nTransformer with an LSTM module and a CNN decoder. The self-attention in\nTransformer enables learning more informative features via an implicit\nnon-local cross reference. The LSTM module is learned to integrate the\nrepresentations across the stack with arbitrary images. To directly capture the\nlow-level features of various degrees of focus/defocus, we propose to use\nmulti-scale convolutional kernels in an early-stage encoder. Benefiting from\nthe design with LSTM, our FocDepthFormer can be pre-trained with abundant\nmonocular RGB depth estimation data for visual pattern capturing, alleviating\nthe demand for the hard-to-collect focal stack data. Extensive experiments on\nvarious focal stack benchmark datasets show that our model outperforms the\nstate-of-the-art models on multiple metrics.\n","authors":["Xueyang Kang","Fengze Han","Abdur Fayjie","Dong Gong"],"pdf_url":"https://arxiv.org/pdf/2310.11178v1.pdf","comment":"20 pages, 18 figures, journal paper"},{"id":"http://arxiv.org/abs/2310.05392v2","updated":"2023-10-17T11:51:56Z","published":"2023-10-09T04:07:35Z","title":"Lightweight Full-Convolutional Siamese Tracker","summary":" Although single object trackers have achieved advanced performance, their\nlarge-scale models make it difficult to apply them on the platforms with\nlimited resources. Moreover, existing lightweight trackers only achieve balance\nbetween 2-3 points in terms of parameters, performance, Flops and FPS. To\nachieve the optimal balance among these points, this paper propose a\nlightweight full-convolutional Siamese tracker called LightFC. LightFC employs\na novel efficient cross-correlation module (ECM) and a novel efficient\nrep-center head (ERH) to enhance the nonlinear expressiveness of the\nconvolutional tracking pipeline. The ECM employs an attention-like module\ndesign, which conducts spatial and channel linear fusion of fused features and\nenhances the nonlinearly of the fused features. Additionally, it references\nsuccessful factors of current lightweight trackers and introduces\nskip-connections and reuse of search area features. The ERH reparameterizes the\nfeature dimensional stage in the standard center head and introduces channel\nattention to optimize the bottleneck of key feature flows. Comprehensive\nexperiments show that LightFC achieves the optimal balance between performance,\nparameters, Flops and FPS. The precision score of LightFC outperforms\nMixFormerV2-S by 3.7 \\% and 6.5 \\% on LaSOT and TNL2K, respectively, while\nusing 5x fewer parameters and 4.6x fewer Flops. Besides, LightFC runs 2x faster\nthan MixFormerV2-S on CPUs. 
Our code and raw results can be found at\nhttps://github.com/LiYunfengLYF/LightFC\n","authors":["Yunfeng Li","Bo Wang","Xueyi Wu","Zhuoyan Liu","Ye Li"],"pdf_url":"https://arxiv.org/pdf/2310.05392v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.15938v2","updated":"2023-10-17T11:50:03Z","published":"2023-03-28T12:49:10Z","title":"fRegGAN with K-space Loss Regularization for Medical Image Translation","summary":" Generative adversarial networks (GANs) have shown remarkable success in\ngenerating realistic images and are increasingly used in medical imaging for\nimage-to-image translation tasks. However, GANs tend to suffer from a frequency\nbias towards low frequencies, which can lead to the removal of important\nstructures in the generated images. To address this issue, we propose a novel\nfrequency-aware image-to-image translation framework based on the supervised\nRegGAN approach, which we call fRegGAN. The framework employs a K-space loss to\nregularize the frequency content of the generated images and incorporates\nwell-known properties of MRI K-space geometry to guide the network training\nprocess. By combine our method with the RegGAN approach, we can mitigate the\neffect of training with misaligned data and frequency bias at the same time. We\nevaluate our method on the public BraTS dataset and outperform the baseline\nmethods in terms of both quantitative and qualitative metrics when synthesizing\nT2-weighted from T1-weighted MR images. Detailed ablation studies are provided\nto understand the effect of each modification on the final performance. The\nproposed method is a step towards improving the performance of image-to-image\ntranslation and synthesis in the medical domain and shows promise for other\napplications in the field of image processing and generation.\n","authors":["Ivo M. Baltruschat","Felix Kreis","Alexander Hoelscher","Melanie Dohmen","Matthias Lenga"],"pdf_url":"https://arxiv.org/pdf/2303.15938v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11173v1","updated":"2023-10-17T11:41:38Z","published":"2023-10-17T11:41:38Z","title":"Knowledge Extraction and Distillation from Large-Scale Image-Text\n Colonoscopy Records Leveraging Large Language and Vision Models","summary":" The development of artificial intelligence systems for colonoscopy analysis\noften necessitates expert-annotated image datasets. However, limitations in\ndataset size and diversity impede model performance and generalisation.\nImage-text colonoscopy records from routine clinical practice, comprising\nmillions of images and text reports, serve as a valuable data source, though\nannotating them is labour-intensive. Here we leverage recent advancements in\nlarge language and vision models and propose EndoKED, a data mining paradigm\nfor deep knowledge extraction and distillation. EndoKED automates the\ntransformation of raw colonoscopy records into image datasets with pixel-level\nannotation. We validate EndoKED using multi-centre datasets of raw colonoscopy\nrecords (~1 million images), demonstrating its superior performance in training\npolyp detection and segmentation models. 
Furthermore, the EndoKED pre-trained\nvision backbone enables data-efficient and generalisable learning for optical\nbiopsy, achieving expert-level performance in both retrospective and\nprospective validation.\n","authors":["Shuo Wang","Yan Zhu","Xiaoyuan Luo","Zhiwei Yang","Yizhe Zhang","Peiyao Fu","Manning Wang","Zhijian Song","Quanlin Li","Pinghong Zhou","Yike Guo"],"pdf_url":"https://arxiv.org/pdf/2310.11173v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11153v1","updated":"2023-10-17T11:19:51Z","published":"2023-10-17T11:19:51Z","title":"Unsupervised Pre-Training Using Masked Autoencoders for ECG Analysis","summary":" Unsupervised learning methods have become increasingly important in deep\nlearning due to their demonstrated large utilization of datasets and higher\naccuracy in computer vision and natural language processing tasks. There is a\ngrowing trend to extend unsupervised learning methods to other domains, which\nhelps to utilize a large amount of unlabelled data. This paper proposes an\nunsupervised pre-training technique based on masked autoencoder (MAE) for\nelectrocardiogram (ECG) signals. In addition, we propose a task-specific\nfine-tuning to form a complete framework for ECG analysis. The framework is\nhigh-level, universal, and not individually adapted to specific model\narchitectures or tasks. Experiments are conducted using various model\narchitectures and large-scale datasets, resulting in an accuracy of 94.39% on\nthe MITDB dataset for ECG arrhythmia classification task. The result shows a\nbetter performance for the classification of previously unseen data for the\nproposed approach compared to fully supervised methods.\n","authors":["Guoxin Wang","Qingyuan Wang","Ganesh Neelakanta Iyer","Avishek Nag","Deepu John"],"pdf_url":"https://arxiv.org/pdf/2310.11153v1.pdf","comment":"Accepted by IEEE Biomedical Circuits and Systems (BIOCAS) 2023"},{"id":"http://arxiv.org/abs/2209.02397v2","updated":"2023-10-17T11:09:43Z","published":"2022-09-06T11:15:58Z","title":"A Scene-Text Synthesis Engine Achieved Through Learning from Decomposed\n Real-World Data","summary":" Scene-text image synthesis techniques that aim to naturally compose text\ninstances on background scene images are very appealing for training deep\nneural networks due to their ability to provide accurate and comprehensive\nannotation information. Prior studies have explored generating synthetic text\nimages on two-dimensional and three-dimensional surfaces using rules derived\nfrom real-world observations. Some of these studies have proposed generating\nscene-text images through learning; however, owing to the absence of a suitable\ntraining dataset, unsupervised frameworks have been explored to learn from\nexisting real-world data, which might not yield reliable performance. To ease\nthis dilemma and facilitate research on learning-based scene text synthesis, we\nintroduce DecompST, a real-world dataset prepared from some public benchmarks,\ncontaining three types of annotations: quadrilateral-level BBoxes, stroke-level\ntext masks, and text-erased images. Leveraging the DecompST dataset, we propose\na Learning-Based Text Synthesis engine (LBTS) that includes a text location\nproposal network (TLPNet) and a text appearance adaptation network (TAANet).\nTLPNet first predicts the suitable regions for text embedding, after which\nTAANet adaptively adjusts the geometry and color of the text instance to match\nthe background context. 
After training, those networks can be integrated and\nutilized to generate the synthetic dataset for scene text analysis tasks.\nComprehensive experiments were conducted to validate the effectiveness of the\nproposed LBTS along with existing methods, and the experimental results\nindicate the proposed LBTS can generate better pretraining data for scene text\ndetectors.\n","authors":["Zhengmi Tang","Tomo Miyazaki","Shinichiro Omachi"],"pdf_url":"https://arxiv.org/pdf/2209.02397v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11142v1","updated":"2023-10-17T10:45:28Z","published":"2023-10-17T10:45:28Z","title":"BayesDiff: Estimating Pixel-wise Uncertainty in Diffusion via Bayesian\n Inference","summary":" Diffusion models have impressive image generation capability, but low-quality\ngenerations still exist, and their identification remains challenging due to\nthe lack of a proper sample-wise metric. To address this, we propose BayesDiff,\na pixel-wise uncertainty estimator for generations from diffusion models based\non Bayesian inference. In particular, we derive a novel uncertainty iteration\nprinciple to characterize the uncertainty dynamics in diffusion, and leverage\nthe last-layer Laplace approximation for efficient Bayesian inference. The\nestimated pixel-wise uncertainty can not only be aggregated into a sample-wise\nmetric to filter out low-fidelity images but also aids in augmenting successful\ngenerations and rectifying artifacts in failed generations in text-to-image\ntasks. Extensive experiments demonstrate the efficacy of BayesDiff and its\npromise for practical applications.\n","authors":["Siqi Kou","Lei Gan","Dequan Wang","Chongxuan Li","Zhijie Deng"],"pdf_url":"https://arxiv.org/pdf/2310.11142v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.13035v3","updated":"2023-10-17T10:23:46Z","published":"2023-05-22T13:39:28Z","title":"Getting ViT in Shape: Scaling Laws for Compute-Optimal Model Design","summary":" Scaling laws have been recently employed to derive compute-optimal model size\n(number of parameters) for a given compute duration. We advance and refine such\nmethods to infer compute-optimal model shapes, such as width and depth, and\nsuccessfully implement this in vision transformers. Our shape-optimized vision\ntransformer, SoViT, achieves results competitive with models that exceed twice\nits size, despite being pre-trained with an equivalent amount of compute. For\nexample, SoViT-400m/14 achieves 90.3% fine-tuning accuracy on ILSRCV2012,\nsurpassing the much larger ViT-g/14 and approaching ViT-G/14 under identical\nsettings, with also less than half the inference cost. We conduct a thorough\nevaluation across multiple tasks, such as image classification, captioning, VQA\nand zero-shot transfer, demonstrating the effectiveness of our model across a\nbroad range of domains and identifying limitations. Overall, our findings\nchallenge the prevailing approach of blindly scaling up vision models and pave\na path for a more informed scaling.\n","authors":["Ibrahim Alabdulmohsin","Xiaohua Zhai","Alexander Kolesnikov","Lucas Beyer"],"pdf_url":"https://arxiv.org/pdf/2305.13035v3.pdf","comment":"10 pages, 7 figures, 9 tables. 
Version 2: Layout fixes"},{"id":"http://arxiv.org/abs/2310.11117v1","updated":"2023-10-17T10:04:47Z","published":"2023-10-17T10:04:47Z","title":"USDC: Unified Static and Dynamic Compression for Visual Transformer","summary":" Visual Transformers have achieved great success in almost all vision tasks,\nsuch as classification, detection, and so on. However, the model complexity and\nthe inference speed of the visual transformers hinder their deployments in\nindustrial products. Various model compression techniques focus on directly\ncompressing the visual transformers into a smaller one while maintaining the\nmodel performance, however, the performance drops dramatically when the\ncompression ratio is large. Furthermore, several dynamic network techniques\nhave also been applied to dynamically compress the visual transformers to\nobtain input-adaptive efficient sub-structures during the inference stage,\nwhich can achieve a better trade-off between the compression ratio and the\nmodel performance. The upper bound of memory of dynamic models is not reduced\nin the practical deployment since the whole original visual transformer model\nand the additional control gating modules should be loaded onto devices\ntogether for inference. To alleviate two disadvantages of two categories of\nmethods, we propose to unify the static compression and dynamic compression\ntechniques jointly to obtain an input-adaptive compressed model, which can\nfurther better balance the total compression ratios and the model performances.\nMoreover, in practical deployment, the batch sizes of the training and\ninference stage are usually different, which will cause the model inference\nperformance to be worse than the model training performance, which is not\ntouched by all previous dynamic network papers. We propose a sub-group gates\naugmentation technique to solve this performance drop problem. Extensive\nexperiments demonstrate the superiority of our method on various baseline\nvisual transformers such as DeiT, T2T-ViT, and so on.\n","authors":["Huan Yuan","Chao Liao","Jianchao Tan","Peng Yao","Jiyuan Jia","Bin Chen","Chengru Song","Di Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.11117v1.pdf","comment":"This paper was actually finished in 2021"},{"id":"http://arxiv.org/abs/2210.11443v2","updated":"2023-10-17T09:54:20Z","published":"2022-10-20T17:45:22Z","title":"Snapshot of Algebraic Vision","summary":" In this survey article, we present interactions between algebraic geometry\nand computer vision, which have recently come under the header of algebraic\nvision. The subject has given new insights in multiple view geometry and its\napplication to 3D scene reconstruction and carried a host of novel problems and\nideas back into algebraic geometry.\n","authors":["Joe Kileel","Kathlén Kohn"],"pdf_url":"https://arxiv.org/pdf/2210.11443v2.pdf","comment":"v2: incorporated referees' suggestions"},{"id":"http://arxiv.org/abs/2310.10123v2","updated":"2023-10-17T09:54:02Z","published":"2023-10-16T07:00:32Z","title":"AutoDIR: Automatic All-in-One Image Restoration with Latent Diffusion","summary":" In this paper, we aim to solve complex real-world image restoration\nsituations, in which, one image may have a variety of unknown degradations. To\nthis end, we propose an all-in-one image restoration framework with latent\ndiffusion (AutoDIR), which can automatically detect and address multiple\nunknown degradations. 
Our framework first utilizes a Blind Image Quality\nAssessment Module (BIQA) to automatically detect and identify the unknown\ndominant image degradation type of the image. Then, an All-in-One Image\nRefinement (AIR) Module handles image restoration for multiple kinds of degradation\nwith the guidance of BIQA. Finally, a Structure Correction Module (SCM) is\nproposed to recover the image details distorted by AIR. Our comprehensive\nevaluation demonstrates that AutoDIR outperforms state-of-the-art approaches by\nachieving superior restoration results while supporting a wider range of tasks.\nNotably, AutoDIR is also the first method to automatically handle real-scenario\nimages with multiple unknown degradations.\n","authors":["Yitong Jiang","Zhaoyang Zhang","Tianfan Xue","Jinwei Gu"],"pdf_url":"https://arxiv.org/pdf/2310.10123v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11112v1","updated":"2023-10-17T09:52:54Z","published":"2023-10-17T09:52:54Z","title":"Super resolution of histopathological frozen sections via deep learning\n preserving tissue structure","summary":" Histopathology plays a pivotal role in medical diagnostics. In contrast to\npreparing permanent sections for histopathology, a time-consuming process,\npreparing frozen sections is significantly faster and can be performed during\nsurgery, where the sample scanning time should be optimized. Super-resolution\ntechniques allow imaging the sample at lower magnification and spare scanning\ntime. In this paper, we present a new approach to super resolution for\nhistopathological frozen sections, with a focus on achieving better distortion\nmeasures, rather than pursuing photorealistic images that may compromise\ncritical diagnostic information. Our deep-learning architecture focuses on\nlearning the error between interpolated images and real images, thereby\ngenerating high-resolution images while preserving critical image details,\nreducing the risk of diagnostic misinterpretation. This is done by leveraging\nthe loss functions in the frequency domain, assigning higher weights to the\nreconstruction of complex, high-frequency components. In comparison to existing\nmethods, we obtained significant improvements in terms of Structural Similarity\nIndex (SSIM) and Peak Signal-to-Noise Ratio (PSNR), as well as recovered\ndetails that were lost in the low-resolution frozen-section images and that affect the\npathologist's clinical decisions. Our approach has great potential for\nproviding more rapid frozen-section imaging, with less scanning, while\npreserving the high resolution in the imaged sample.\n","authors":["Elad Yoshai","Gil Goldinger","Miki Haifler","Natan T. Shaked"],"pdf_url":"https://arxiv.org/pdf/2310.11112v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11106v1","updated":"2023-10-17T09:44:30Z","published":"2023-10-17T09:44:30Z","title":"3D Structure-guided Network for Tooth Alignment in 2D Photograph","summary":" Orthodontics focuses on rectifying misaligned teeth (i.e., malocclusions),\naffecting both masticatory function and aesthetics. However, orthodontic\ntreatment often involves complex, lengthy procedures. As such, generating a 2D\nphotograph depicting aligned teeth prior to orthodontic treatment is crucial\nfor effective dentist-patient communication and, more importantly, for\nencouraging patients to accept orthodontic intervention. 
In this paper, we\npropose a 3D structure-guided tooth alignment network that takes 2D photographs\nas input (e.g., photos captured by smartphones) and aligns the teeth within the\n2D image space to generate an orthodontic comparison photograph featuring\naesthetically pleasing, aligned teeth. Notably, while the process operates\nwithin a 2D image space, our method employs 3D intra-oral scanning models\ncollected in clinics to learn about orthodontic treatment, i.e., projecting the\npre- and post-orthodontic 3D tooth structures onto 2D tooth contours, followed\nby a diffusion model to learn the mapping relationship. Ultimately, the aligned\ntooth contours are leveraged to guide the generation of a 2D photograph with\naesthetically pleasing, aligned teeth and realistic textures. We evaluate our\nnetwork on various facial photographs, demonstrating its exceptional\nperformance and strong applicability within the orthodontic industry.\n","authors":["Yulong Dou","Lanzhuju Mei","Dinggang Shen","Zhiming Cui"],"pdf_url":"https://arxiv.org/pdf/2310.11106v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11105v1","updated":"2023-10-17T09:39:53Z","published":"2023-10-17T09:39:53Z","title":"Generalizability of CNN Architectures for Face Morph Presentation Attack","summary":" Automatic border control systems are widespread in modern airports\nworldwide. Morphing attacks on face biometrics are a serious threat that\nundermines the security and reliability of face recognition systems deployed in\nairports and border controls. Therefore, developing a robust Machine Learning\n(ML) system is necessary to prevent criminals from crossing borders with fake\nidentifications, especially since it has been shown that security officers\ncannot detect morphs better than machines. In this study, we investigate the\ngeneralization power of Convolutional Neural Network (CNN) architectures\nagainst morphing attacks. The investigation utilizes 5 distinct CNNs, namely\nShuffleNet, DenseNet201, VGG16, EfficientNet-B0 and InceptionResNet-v2. Each\nCNN architecture represents a well-known family of CNN models in terms of\nnumber of parameters, architectural design and performance across various\ncomputer vision applications. To ensure robust evaluation, we employ 4\ndifferent datasets (Utrecht, London, Defacto and KurdFace) that contain a\ndiverse range of digital face images which cover variations in ethnicity,\ngender, age, lighting condition and camera setting. One of the fundamental\nconcepts of ML system design is the ability to generalize effectively to\npreviously unseen data; hence, we not only evaluate the performance of CNN\nmodels within individual datasets but also explore their performance across\ncombined datasets and investigate each dataset in the testing phase only.\nExperimental results on more than 8 thousand images (genuine and morph) from\nthe 4 datasets show that InceptionResNet-v2 generalizes better to unseen data\nand outperforms the other 4 CNN models.\n","authors":["Sherko R. HmaSalah","Aras Asaad"],"pdf_url":"https://arxiv.org/pdf/2310.11105v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11093v1","updated":"2023-10-17T09:22:20Z","published":"2023-10-17T09:22:20Z","title":"SODA: Robust Training of Test-Time Data Adaptors","summary":" Adapting models deployed to test distributions can mitigate the performance\ndegradation caused by distribution shifts. However, privacy concerns may render\nmodel parameters inaccessible. 
One promising approach involves utilizing\nzeroth-order optimization (ZOO) to train a data adaptor to adapt the test data\nto fit the deployed models. Nevertheless, the data adaptor trained with ZOO\ntypically brings restricted improvements due to the potential corruption of\ndata features caused by the data adaptor. To address this issue, we revisit ZOO\nin the context of test-time data adaptation. We find that the issue directly\nstems from the unreliable estimation of the gradients used to optimize the data\nadaptor, which is inherently due to the unreliable nature of the pseudo-labels\nassigned to the test data. Based on this observation, we propose\npseudo-label-robust data adaptation (SODA) to improve the performance of data\nadaptation. Specifically, SODA leverages high-confidence predicted labels as\nreliable labels to optimize the data adaptor with ZOO for label prediction. For\ndata with low-confidence predictions, SODA encourages the adaptor to preserve\ndata information to mitigate data corruption. Empirical results indicate that\nSODA can significantly enhance the performance of deployed models in the\npresence of distribution shifts without requiring access to model parameters.\n","authors":["Zige Wang","Yonggang Zhang","Zhen Fang","Long Lan","Wenjing Yang","Bo Han"],"pdf_url":"https://arxiv.org/pdf/2310.11093v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.09483v3","updated":"2023-10-17T09:22:01Z","published":"2022-09-20T05:52:28Z","title":"Diffusion Unit: Interpretable Edge Enhancement and Suppression Learning\n for 3D Point Cloud Segmentation","summary":" 3D point clouds are discrete samples of continuous surfaces which can be used\nfor various applications. However, the lack of true connectivity information,\ni.e., edge information, makes point cloud recognition challenging. Recent\nedge-aware methods incorporate edge modeling into network designs to better\ndescribe local structures. Although these methods show that incorporating edge\ninformation is beneficial, how edge information helps remains unclear, making\nit difficult for users to analyze its usefulness. To shed light on this issue,\nin this study, we propose a new algorithm called Diffusion Unit (DU) that\nhandles edge information in a principled and interpretable manner while\nproviding decent improvement. First, we theoretically show that DU learns to\nperform task-beneficial edge enhancement and suppression. Second, we\nexperimentally observe and verify the edge enhancement and suppression\nbehavior. Third, we empirically demonstrate that this behavior contributes to\nperformance improvement. Extensive experiments and analyses performed on\nchallenging benchmarks verify the effectiveness of DU. Specifically, our method\nachieves state-of-the-art performance in object part segmentation using\nShapeNet part and scene segmentation using S3DIS. Our source code is available\nat https://github.com/martianxiu/DiffusionUnit.\n","authors":["Haoyi Xiu","Xin Liu","Weimin Wang","Kyoung-Sook Kim","Takayuki Shinohara","Qiong Chang","Masashi Matsuoka"],"pdf_url":"https://arxiv.org/pdf/2209.09483v3.pdf","comment":"Neurocomputing"},{"id":"http://arxiv.org/abs/2310.11092v1","updated":"2023-10-17T09:21:29Z","published":"2023-10-17T09:21:29Z","title":"DORec: Decomposed Object Reconstruction Utilizing 2D Self-Supervised\n Features","summary":" Decomposing a target object from a complex background while reconstructing is\nchallenging. 
Most approaches acquire perception of object instances\nthrough the use of manual labels, but the annotation procedure is costly. The\nrecent advancements in 2D self-supervised learning have brought new prospects\nto object-aware representation, yet it remains unclear how to leverage such\nnoisy 2D features for clean decomposition. In this paper, we propose a\nDecomposed Object Reconstruction (DORec) network based on neural implicit\nrepresentations. Our key idea is to transfer 2D self-supervised features into\nmasks of two levels of granularity to supervise the decomposition, including a\nbinary mask to indicate the foreground regions and a K-cluster mask to indicate\nthe semantically similar regions. These two masks are complementary to each\nother and lead to robust decomposition. Experimental results show the\nsuperiority of DORec in segmenting and reconstructing the foreground object on\nvarious datasets.\n","authors":["Jun Wu","Sicheng Li","Sihui Ji","Yue Wang","Rong Xiong","Yiyi Liao"],"pdf_url":"https://arxiv.org/pdf/2310.11092v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.04238v4","updated":"2023-10-17T09:16:06Z","published":"2023-03-07T21:03:48Z","title":"Patch of Invisibility: Naturalistic Physical Black-Box Adversarial\n Attacks on Object Detectors","summary":" Adversarial attacks on deep-learning models have been receiving increased\nattention in recent years. Work in this area has mostly focused on\ngradient-based techniques, so-called ``white-box'' attacks, wherein the\nattacker has access to the targeted model's internal parameters; such an\nassumption is usually unrealistic in the real world. Some attacks additionally\nuse the entire pixel space to fool a given model, which is neither practical\nnor physical (i.e., real-world). On the contrary, we propose herein a direct,\nblack-box, gradient-free method that uses the learned image manifold of a\npretrained generative adversarial network (GAN) to generate naturalistic\nphysical adversarial patches for object detectors. To our knowledge, this is the\nfirst and only method that performs black-box physical attacks directly on\nobject-detection models, resulting in a model-agnostic attack. We show\nthat our proposed method works both digitally and physically. We compared our\napproach against four different black-box attacks with different\nconfigurations. Our approach outperformed all other approaches that were tested\nin our experiments by a large margin.\n","authors":["Raz Lapid","Eylon Mizrahi","Moshe Sipper"],"pdf_url":"https://arxiv.org/pdf/2303.04238v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07184v2","updated":"2023-10-17T09:00:22Z","published":"2023-10-11T04:20:32Z","title":"NeuroInspect: Interpretable Neuron-based Debugging Framework through\n Class-conditional Visualizations","summary":" Although deep learning (DL) has achieved remarkable progress in various\ndomains, DL models are still prone to making mistakes. This issue\nnecessitates effective debugging tools for DL practitioners to interpret the\ndecision-making process within the networks. However, existing debugging\nmethods often demand extra data or adjustments to the decision process,\nlimiting their applicability. To tackle this problem, we present NeuroInspect,\nan interpretable neuron-based debugging framework with three key stages:\ncounterfactual explanations, feature visualizations, and false correlation\nmitigation. 
Our debugging framework first pinpoints neurons responsible for\nmistakes in the network and then visualizes features embedded in the neurons to\nbe human-interpretable. To provide these explanations, we introduce\nCLIP-Illusion, a novel feature visualization method that generates images\nrepresenting features conditioned on classes to examine the connection between\nneurons and the decision layer. We alleviate convoluted explanations of the\nconventional visualization approach by employing class information, thereby\nisolating mixed properties. This process offers more human-interpretable\nexplanations for model errors without altering the trained network or requiring\nadditional data. Furthermore, our framework mitigates false correlations\nlearned from a dataset under a stochastic perspective, modifying decisions for\nthe neurons considered as the main causes. We validate the effectiveness of our\nframework by addressing false correlations and improving inferences for classes\nwith the worst performance in real-world settings. Moreover, we demonstrate\nthat NeuroInspect helps debug the mistakes of DL models through evaluation for\nhuman understanding. The code is openly available at\nhttps://github.com/yeongjoonJu/NeuroInspect.\n","authors":["Yeong-Joon Ju","Ji-Hoon Park","Seong-Whan Lee"],"pdf_url":"https://arxiv.org/pdf/2310.07184v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.15010v3","updated":"2023-10-17T08:57:30Z","published":"2023-06-26T18:49:09Z","title":"Efficient High-Resolution Template Matching with Vector Quantized\n Nearest Neighbour Fields","summary":" Template matching is a fundamental problem in computer vision with\napplications in fields including object detection, image registration, and\nobject tracking. Current methods rely on nearest-neighbour (NN) matching, where\nthe query feature space is converted to NN space by representing each query\npixel with its NN in the template. NN-based methods have been shown to perform\nbetter in occlusions, appearance changes, and non-rigid transformations;\nhowever, they scale poorly with high-resolution data and high feature\ndimensions. We present an NN-based method which efficiently reduces the NN\ncomputations and introduces filtering in the NN fields (NNFs). A vector\nquantization step is introduced before the NN calculation to represent the\ntemplate with $k$ features, and the filter response over the NNFs is used to\ncompare the template and query distributions over the features. We show that\nstate-of-the-art performance is achieved in low-resolution data, and our method\noutperforms previous methods at higher resolution.\n","authors":["Ankit Gupta","Ida-Maria Sintorn"],"pdf_url":"https://arxiv.org/pdf/2306.15010v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11077v1","updated":"2023-10-17T08:51:44Z","published":"2023-10-17T08:51:44Z","title":"United We Stand: Using Epoch-wise Agreement of Ensembles to Combat\n Overfit","summary":" Deep neural networks have become the method of choice for solving many image\nclassification tasks, largely because they can fit very complex functions\ndefined over raw images. The downside of such powerful learners is the danger\nof overfitting the training set, leading to poor generalization, which is\nusually avoided by regularization and \"early stopping\" of the training. In this\npaper, we propose a new deep network ensemble classifier that is very effective\nagainst overfit. 
We begin with the theoretical analysis of a regression model,\nwhose prediction - that the variance among classifiers increases when overfit\noccurs - is demonstrated empirically in deep networks in common use. Guided by\nthese results, we construct a new ensemble-based prediction method designed to\ncombat overfit, where the prediction is determined by the most consensual\nprediction throughout the training. On multiple image and text classification\ndatasets, we show that when regular ensembles suffer from overfit, our method\neliminates the harmful reduction in generalization due to overfit, and often\neven surpasses the performance obtained by early stopping. Our method is easy\nto implement, and can be integrated with any training scheme and architecture,\nwithout additional prior knowledge beyond the training set. Accordingly, it is\na practical and useful tool to overcome overfit.\n","authors":["Uri Stern","Daniel Shwartz","Daphna Weinshall"],"pdf_url":"https://arxiv.org/pdf/2310.11077v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02569v2","updated":"2023-10-17T08:11:15Z","published":"2023-10-04T04:07:37Z","title":"ReForm-Eval: Evaluating Large Vision Language Models via Unified\n Re-Formulation of Task-Oriented Benchmarks","summary":" Recent years have witnessed remarkable progress in the development of large\nvision-language models (LVLMs). Benefiting from the strong language backbones\nand efficient cross-modal alignment strategies, LVLMs exhibit surprising\ncapabilities to perceive visual signals and perform visually grounded\nreasoning. However, the capabilities of LVLMs have not been comprehensively and\nquantitatively evaluated. Most existing multi-modal benchmarks require\ntask-oriented input-output formats, posing great challenges to automatically\nassess the free-form text output of LVLMs. To effectively leverage the\nannotations available in existing benchmarks and reduce the manual effort\nrequired for constructing new benchmarks, we propose to re-formulate existing\nbenchmarks into unified LVLM-compatible formats. Through systematic data\ncollection and reformulation, we present the ReForm-Eval benchmark, offering\nsubstantial data for evaluating various capabilities of LVLMs. Based on\nReForm-Eval, we conduct extensive experiments, thoroughly analyze the strengths\nand weaknesses of existing LVLMs, and identify the underlying factors. Our\nbenchmark and evaluation framework will be open-sourced as a cornerstone for\nadvancing the development of LVLMs.\n","authors":["Zejun Li","Ye Wang","Mengfei Du","Qingwen Liu","Binhao Wu","Jiwen Zhang","Chengxing Zhou","Zhihao Fan","Jie Fu","Jingjing Chen","Xuanjing Huang","Zhongyu Wei"],"pdf_url":"https://arxiv.org/pdf/2310.02569v2.pdf","comment":"38 pages, 11 figures, 24 tables"},{"id":"http://arxiv.org/abs/2306.04607v5","updated":"2023-10-17T07:51:55Z","published":"2023-06-07T17:17:58Z","title":"GeoDiffusion: Text-Prompted Geometric Control for Object Detection Data\n Generation","summary":" Diffusion models have attracted significant attention due to their remarkable\nability to create content and generate data for tasks like image\nclassification. However, the use of diffusion models to generate\nhigh-quality object detection data remains an underexplored area, where not\nonly image-level perceptual quality but also geometric conditions such as\nbounding boxes and camera views are essential. 
Previous studies have utilized\neither copy-paste synthesis or layout-to-image (L2I) generation with\nspecifically designed modules to encode semantic layouts. In this paper, we\npropose GeoDiffusion, a simple framework that can flexibly translate various\ngeometric conditions into text prompts and empower pre-trained text-to-image\n(T2I) diffusion models for high-quality detection data generation. Unlike\nprevious L2I methods, our GeoDiffusion is able to encode not only the bounding\nboxes but also extra geometric conditions such as camera views in self-driving\nscenes. Extensive experiments demonstrate GeoDiffusion outperforms previous L2I\nmethods while maintaining 4x training time faster. To the best of our\nknowledge, this is the first work to adopt diffusion models for layout-to-image\ngeneration with geometric conditions and demonstrate that L2I-generated images\ncan be beneficial for improving the performance of object detectors.\n","authors":["Kai Chen","Enze Xie","Zhe Chen","Yibo Wang","Lanqing Hong","Zhenguo Li","Dit-Yan Yeung"],"pdf_url":"https://arxiv.org/pdf/2306.04607v5.pdf","comment":"Project Page: https://kaichen1998.github.io/projects/geodiffusion/"},{"id":"http://arxiv.org/abs/2306.02602v2","updated":"2023-10-17T07:51:10Z","published":"2023-06-05T05:21:15Z","title":"ReContrast: Domain-Specific Anomaly Detection via Contrastive\n Reconstruction","summary":" Most advanced unsupervised anomaly detection (UAD) methods rely on modeling\nfeature representations of frozen encoder networks pre-trained on large-scale\ndatasets, e.g. ImageNet. However, the features extracted from the encoders that\nare borrowed from natural image domains coincide little with the features\nrequired in the target UAD domain, such as industrial inspection and medical\nimaging. In this paper, we propose a novel epistemic UAD method, namely\nReContrast, which optimizes the entire network to reduce biases towards the\npre-trained image domain and orients the network in the target domain. We start\nwith a feature reconstruction approach that detects anomalies from errors.\nEssentially, the elements of contrastive learning are elegantly embedded in\nfeature reconstruction to prevent the network from training instability,\npattern collapse, and identical shortcut, while simultaneously optimizing both\nthe encoder and decoder on the target domain. To demonstrate our transfer\nability on various image domains, we conduct extensive experiments across two\npopular industrial defect detection benchmarks and three medical image UAD\ntasks, which shows our superiority over current state-of-the-art methods.\n","authors":["Jia Guo","Shuai Lu","Lize Jia","Weihang Zhang","Huiqi Li"],"pdf_url":"https://arxiv.org/pdf/2306.02602v2.pdf","comment":"NeurIPS 2023 Poster"},{"id":"http://arxiv.org/abs/2305.12961v2","updated":"2023-10-17T07:44:33Z","published":"2023-05-22T12:11:07Z","title":"Enhanced Meta Label Correction for Coping with Label Corruption","summary":" Traditional methods for learning with the presence of noisy labels have\nsuccessfully handled datasets with artificially injected noise but still fall\nshort of adequately handling real-world noise. With the increasing use of\nmeta-learning in the diverse fields of machine learning, researchers leveraged\nauxiliary small clean datasets to meta-correct the training labels.\nNonetheless, existing meta-label correction approaches are not fully exploiting\ntheir potential. 
In this study, we propose an Enhanced Meta Label Correction\napproach abbreviated as EMLC for the learning with noisy labels (LNL) problem.\nWe re-examine the meta-learning process and introduce faster and more accurate\nmeta-gradient derivations. We propose a novel teacher architecture tailored\nexplicitly to the LNL problem, equipped with novel training objectives. EMLC\noutperforms prior approaches and achieves state-of-the-art results in all\nstandard benchmarks. Notably, EMLC enhances the previous art on the noisy\nreal-world dataset Clothing1M by $1.52\\%$ while requiring $\\times 0.5$ the time\nper epoch and with much faster convergence of the meta-objective when compared\nto the baseline approach.\n","authors":["Mitchell Keren Taraday","Chaim Baskin"],"pdf_url":"https://arxiv.org/pdf/2305.12961v2.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2310.11050v1","updated":"2023-10-17T07:37:32Z","published":"2023-10-17T07:37:32Z","title":"$k$-$t$ CLAIR: Self-Consistency Guided Multi-Prior Learning for Dynamic\n Parallel MR Image Reconstruction","summary":" Cardiac magnetic resonance imaging (CMR) has been widely used in clinical\npractice for the medical diagnosis of cardiac diseases. However, the long\nacquisition time hinders its development in real-time applications. Here, we\npropose a novel self-consistency guided multi-prior learning framework named\n$k$-$t$ CLAIR to exploit spatiotemporal correlations from highly undersampled\ndata for accelerated dynamic parallel MRI reconstruction. The $k$-$t$ CLAIR\nprogressively reconstructs faithful images by leveraging multiple complementary\npriors learned in the $x$-$t$, $x$-$f$, and $k$-$t$ domains in an iterative\nfashion, as dynamic MRI exhibits high spatiotemporal redundancy. Additionally,\n$k$-$t$ CLAIR incorporates calibration information for prior learning,\nresulting in a more consistent reconstruction. Experimental results on cardiac\ncine and T1W/T2W images demonstrate that $k$-$t$ CLAIR achieves high-quality\ndynamic MR reconstruction in terms of both quantitative and qualitative\nperformance.\n","authors":["Liping Zhang","Weitian Chen"],"pdf_url":"https://arxiv.org/pdf/2310.11050v1.pdf","comment":"12 pages, 3 figures, 4 tables. CMRxRecon Challenge, MICCAI 2023"},{"id":"http://arxiv.org/abs/2310.11040v1","updated":"2023-10-17T07:13:28Z","published":"2023-10-17T07:13:28Z","title":"Co-Learning Semantic-aware Unsupervised Segmentation for Pathological\n Image Registration","summary":" The registration of pathological images plays an important role in medical\napplications. Despite its significance, most researchers in this field\nprimarily focus on the registration of normal tissue into normal tissue. The\nnegative impact of focal tissue, such as the loss of spatial correspondence\ninformation and the abnormal distortion of tissue, are rarely considered. In\nthis paper, we propose GIRNet, a novel unsupervised approach for pathological\nimage registration by incorporating segmentation and inpainting through the\nprinciples of Generation, Inpainting, and Registration (GIR). The registration,\nsegmentation, and inpainting modules are trained simultaneously in a\nco-learning manner so that the segmentation of the focal area and the\nregistration of inpainted pairs can improve collaboratively. Overall, the\nregistration of pathological images is achieved in a completely unsupervised\nlearning framework. 
Experimental results on multiple datasets, including\nMagnetic Resonance Imaging (MRI) of T1 sequences, demonstrate the efficacy of\nour proposed method. Our results show that our method can accurately achieve\nthe registration of pathological images and identify lesions even in\nchallenging imaging modalities. Our unsupervised approach offers a promising\nsolution for the efficient and cost-effective registration of pathological\nimages. Our code is available at\nhttps://github.com/brain-intelligence-lab/GIRNet.\n","authors":["Yang Liu","Shi Gu"],"pdf_url":"https://arxiv.org/pdf/2310.11040v1.pdf","comment":"13 pages, 7 figures, published in Medical Image Computing and\n Computer Assisted Intervention (MICCAI) 2023"},{"id":"http://arxiv.org/abs/2310.11031v1","updated":"2023-10-17T07:01:24Z","published":"2023-10-17T07:01:24Z","title":"Domain Generalization Using Large Pretrained Models with\n Mixture-of-Adapters","summary":" Learning a robust vision model despite large distribution shift is essential\nfor model deployment in real-world settings. In particular, domain generalization\n(DG) algorithms aim to maintain the performance of a trained model on different\ndistributions which were not seen during training. One of the most effective\nmethods has been leveraging the already learned rich knowledge of large\npretrained models. However, naively fine-tuning large models to DG tasks is\noften practically infeasible due to memory limitations, extensive time\nrequirements for training, and the risk of learned knowledge deterioration.\nRecently, parameter-efficient fine-tuning (PEFT) methods have been proposed to\nreduce the high computational cost during training and efficiently adapt large\nmodels to downstream tasks. In this work, for the first time, we find that the\nuse of adapters in PEFT methods not only reduces the high computational cost during\ntraining but also serves as an effective regularizer for DG tasks. Surprisingly,\na naive adapter implementation for large models achieves superior performance on\ncommon datasets. However, in situations of large distribution shifts,\nadditional factors such as the optimal amount of regularization due to the strength\nof distribution shifts should be considered for a sophisticated adapter\nimplementation. To address this, we propose a mixture-of-expert based adapter\nfine-tuning method, dubbed mixture-of-adapters (MoA). Specifically, we\nemploy multiple adapters that have varying capacities, and by using learnable\nrouters, we allocate each token to a proper adapter. By using both PEFT and MoA\nmethods, we effectively alleviate the performance deterioration caused by\ndistribution shifts and achieve state-of-the-art performance on diverse DG\nbenchmarks.\n","authors":["Gyuseong Lee","Wooseok Jang","Jin Hyeon Kim","Jaewoo Jung","Seungryong Kim"],"pdf_url":"https://arxiv.org/pdf/2310.11031v1.pdf","comment":"20 pages, 11 figures"},{"id":"http://arxiv.org/abs/2307.09052v3","updated":"2023-10-17T06:56:51Z","published":"2023-07-18T08:06:14Z","title":"Connections between Operator-splitting Methods and Deep Neural Networks\n with Applications in Image Segmentation","summary":" Deep neural networks are a powerful tool for many tasks. Understanding why they\nare so successful and providing a mathematical explanation is an important\nproblem and has been a popular research direction in recent years. In the\nliterature of mathematical analysis of deep neural networks, much work is\ndedicated to establishing representation theories. 
How to make connections\nbetween deep neural networks and mathematical algorithms is still under\ndevelopment. In this paper, we give an algorithmic explanation for deep neural\nnetworks, especially in their connections with operator splitting. We show that\nwith certain splitting strategies, operator-splitting methods have the same\nstructure as networks. Utilizing this connection and the Potts model for image\nsegmentation, two networks inspired by operator-splitting methods are proposed.\nThe two networks are essentially two operator-splitting algorithms solving the\nPotts model. Numerical experiments are presented to demonstrate the\neffectiveness of the proposed networks.\n","authors":["Hao Liu","Xue-Cheng Tai","Raymond Chan"],"pdf_url":"https://arxiv.org/pdf/2307.09052v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.10491v2","updated":"2023-10-17T06:24:02Z","published":"2023-09-19T09:59:08Z","title":"DCPT: Darkness Clue-Prompted Tracking in Nighttime UAVs","summary":" Existing nighttime unmanned aerial vehicle (UAV) trackers follow an\n\"Enhance-then-Track\" architecture - first using a light enhancer to brighten\nthe nighttime video, then employing a daytime tracker to locate the object.\nThis separate enhancement and tracking fails to build an end-to-end trainable\nvision system. To address this, we propose a novel architecture called Darkness\nClue-Prompted Tracking (DCPT) that achieves robust UAV tracking at night by\nefficiently learning to generate darkness clue prompts. Without a separate\nenhancer, DCPT directly encodes anti-dark capabilities into prompts using a\ndarkness clue prompter (DCP). Specifically, DCP iteratively learns emphasizing\nand undermining projections for darkness clues. It then injects these learned\nvisual prompts into a daytime tracker with fixed parameters across transformer\nlayers. Moreover, a gated feature aggregation mechanism enables adaptive fusion\nbetween prompts and between prompts and the base model. Extensive experiments\nshow state-of-the-art performance for DCPT on multiple dark scenario\nbenchmarks. The unified end-to-end learning of enhancement and tracking in DCPT\nenables a more trainable system. The darkness clue prompting efficiently\ninjects anti-dark knowledge without extra modules. Code is available at\n\\href{https://github.com/bearyi26/DCPT}{here}.\n","authors":["Jiawen Zhu","Huayi Tang","Zhi-Qi Cheng","Jun-Yan He","Bin Luo","Shihao Qiu","Shengming Li","Huchuan Lu"],"pdf_url":"https://arxiv.org/pdf/2309.10491v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2310.02523v2","updated":"2023-10-17T06:13:50Z","published":"2023-10-04T01:47:36Z","title":"A Spatio-Temporal Attention-Based Method for Detecting Student Classroom\n Behaviors","summary":" Accurately detecting student behavior from classroom videos is beneficial for\nanalyzing their classroom status and improving teaching efficiency. However,\nlow accuracy in student classroom behavior detection is a prevalent issue. To\naddress this issue, we propose a Spatio-Temporal Attention-Based Method for\nDetecting Student Classroom Behaviors (BDSTA). Firstly, the SlowFast network is\nused to generate motion and environmental information feature maps from the\nvideo. 
Then, the spatio-temporal attention module is applied to the feature\nmaps, including information aggregation, compression and stimulation processes.\nSubsequently, attention maps in the time, channel and space dimensions are\nobtained, and multi-label behavior classification is performed based on these\nattention maps. To solve the long-tail data problem that exists in student\nclassroom behavior datasets, we use an improved focal loss function to assign\nmore weight to the tail class data during training. Experiments are\nconducted on a self-made student classroom behavior dataset named STSCB.\nCompared with the SlowFast model, the average accuracy of student behavior\nclassification detection improves by 8.94\\% using BDSTA.\n","authors":["Fan Yang"],"pdf_url":"https://arxiv.org/pdf/2310.02523v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.11332v2","updated":"2023-10-17T06:10:14Z","published":"2023-03-21T05:50:53Z","title":"Deep Learning for Video-based Person Re-Identification: A Survey","summary":" Video-based person re-identification (video re-ID) has lately attracted\ngrowing attention due to its broad practical applications in various areas,\nsuch as surveillance, smart city, and public safety. Nevertheless, video re-ID\nis quite difficult and remains an ongoing research area due to numerous challenges,\nsuch as viewpoint, occlusion, pose variation, and uncertain video sequences.\nIn the last couple of years, deep learning on video re-ID has continuously\nachieved surprising results on public datasets, with various approaches being\ndeveloped to handle diverse problems in video re-ID. Compared to image-based\nre-ID, video re-ID is much more challenging and complex. To encourage future\nresearch, this paper introduces the first comprehensive review of\nup-to-date advancements in deep learning approaches for video re-ID. It broadly\ncovers three important aspects, including a brief overview of video re-ID methods with their\nlimitations, major milestones with technical challenges, and architectural\ndesign. It offers comparative performance analysis on various available\ndatasets, guidance to improve video re-ID with valuable thoughts, and exciting\nresearch directions.\n","authors":["Khawar Islam"],"pdf_url":"https://arxiv.org/pdf/2303.11332v2.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2203.01536v5","updated":"2023-10-17T06:05:41Z","published":"2022-03-03T06:17:03Z","title":"Recent Advances in Vision Transformer: A Survey and Outlook of Recent\n Work","summary":" Vision Transformers (ViTs) are becoming an increasingly popular and dominant technique\nfor various vision tasks, compared to Convolutional Neural Networks (CNNs). As a\ndemanding technique in computer vision, ViTs have successfully solved\nvarious vision problems while focusing on long-range relationships. In this\npaper, we begin by introducing the fundamental concepts and background of the\nself-attention mechanism. Next, we provide a comprehensive overview of recent\ntop-performing ViT methods, describing them in terms of strengths and weaknesses,\ncomputational cost, and training and testing datasets. We thoroughly\ncompare the performance of various ViT algorithms and most representative CNN\nmethods on popular benchmark datasets. Finally, we explore some limitations\nwith insightful observations and provide further research directions. 
The\nproject page along with the collections of papers are available at\nhttps://github.com/khawar512/ViT-Survey\n","authors":["Khawar Islam"],"pdf_url":"https://arxiv.org/pdf/2203.01536v5.pdf","comment":"Added AAAI 2022 methods and working on ICLR 2022 methods and ICML\n 2022"},{"id":"http://arxiv.org/abs/2305.14014v2","updated":"2023-10-17T05:39:43Z","published":"2023-05-23T12:51:20Z","title":"CLIP4STR: A Simple Baseline for Scene Text Recognition with Pre-trained\n Vision-Language Model","summary":" Pre-trained vision-language models~(VLMs) are the de-facto foundation models\nfor various downstream tasks. However, scene text recognition methods still\nprefer backbones pre-trained on a single modality, namely, the visual modality,\ndespite the potential of VLMs to serve as powerful scene text readers. For\nexample, CLIP can robustly identify regular (horizontal) and irregular\n(rotated, curved, blurred, or occluded) text in images. With such merits, we\ntransform CLIP into a scene text reader and introduce CLIP4STR, a simple yet\neffective STR method built upon image and text encoders of CLIP. It has two\nencoder-decoder branches: a visual branch and a cross-modal branch. The visual\nbranch provides an initial prediction based on the visual feature, and the\ncross-modal branch refines this prediction by addressing the discrepancy\nbetween the visual feature and text semantics. To fully leverage the\ncapabilities of both branches, we design a dual predict-and-refine decoding\nscheme for inference. CLIP4STR achieves new state-of-the-art performance on 11\nSTR benchmarks. Additionally, a comprehensive empirical study is provided to\nenhance the understanding of the adaptation of CLIP to STR. We believe our\nmethod establishes a simple but strong baseline for future STR research with\nVLMs.\n","authors":["Shuai Zhao","Xiaohan Wang","Linchao Zhu","Ruijie Quan","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2305.14014v2.pdf","comment":"Preprint, work in progress"},{"id":"http://arxiv.org/abs/2308.08857v2","updated":"2023-10-17T05:27:05Z","published":"2023-08-17T08:31:11Z","title":"D-IF: Uncertainty-aware Human Digitization via Implicit Distribution\n Field","summary":" Realistic virtual humans play a crucial role in numerous industries, such as\nmetaverse, intelligent healthcare, and self-driving simulation. But creating\nthem on a large scale with high levels of realism remains a challenge. The\nutilization of deep implicit function sparks a new era of image-based 3D\nclothed human reconstruction, enabling pixel-aligned shape recovery with fine\ndetails. Subsequently, the vast majority of works locate the surface by\nregressing the deterministic implicit value for each point. However, should all\npoints be treated equally regardless of their proximity to the surface? In this\npaper, we propose replacing the implicit value with an adaptive uncertainty\ndistribution, to differentiate between points based on their distance to the\nsurface. This simple ``value to distribution'' transition yields significant\nimprovements on nearly all the baselines. Furthermore, qualitative results\ndemonstrate that the models trained using our uncertainty distribution loss,\ncan capture more intricate wrinkles, and realistic limbs. 
Code and models are\navailable for research purposes at https://github.com/psyai-net/D-IF_release.\n","authors":["Xueting Yang","Yihao Luo","Yuliang Xiu","Wei Wang","Hao Xu","Zhaoxin Fan"],"pdf_url":"https://arxiv.org/pdf/2308.08857v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.15818v2","updated":"2023-10-17T04:59:54Z","published":"2023-09-27T17:44:18Z","title":"Show-1: Marrying Pixel and Latent Diffusion Models for Text-to-Video\n Generation","summary":" Significant advancements have been achieved in the realm of large-scale\npre-trained text-to-video Diffusion Models (VDMs). However, previous methods\neither rely solely on pixel-based VDMs, which come with high computational\ncosts, or on latent-based VDMs, which often struggle with precise text-video\nalignment. In this paper, we are the first to propose a hybrid model, dubbed as\nShow-1, which marries pixel-based and latent-based VDMs for text-to-video\ngeneration. Our model first uses pixel-based VDMs to produce a low-resolution\nvideo of strong text-video correlation. After that, we propose a novel expert\ntranslation method that employs the latent-based VDMs to further upsample the\nlow-resolution video to high resolution. Compared to latent VDMs, Show-1 can\nproduce high-quality videos of precise text-video alignment; Compared to pixel\nVDMs, Show-1 is much more efficient (GPU memory usage during inference is 15G\nvs 72G). We also validate our model on standard video generation benchmarks.\nOur code and model weights are publicly available at\nhttps://github.com/showlab/Show-1.\n","authors":["David Junhao Zhang","Jay Zhangjie Wu","Jia-Wei Liu","Rui Zhao","Lingmin Ran","Yuchao Gu","Difei Gao","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2309.15818v2.pdf","comment":"project page is https://showlab.github.io/Show-1"},{"id":"http://arxiv.org/abs/2310.10198v2","updated":"2023-10-17T04:53:30Z","published":"2023-10-16T09:09:02Z","title":"MoConVQ: Unified Physics-Based Motion Control via Scalable Discrete\n Representations","summary":" In this work, we present MoConVQ, a novel unified framework for physics-based\nmotion control leveraging scalable discrete representations. Building upon\nvector quantized variational autoencoders (VQ-VAE) and model-based\nreinforcement learning, our approach effectively learns motion embeddings from\na large, unstructured dataset spanning tens of hours of motion examples. The\nresultant motion representation not only captures diverse motion skills but\nalso offers a robust and intuitive interface for various applications. 
We\ndemonstrate the versatility of MoConVQ through several applications: universal\ntracking control from various motion sources, interactive character control\nwith latent motion representations using supervised learning, physics-based\nmotion generation from natural language descriptions using the GPT framework,\nand, most interestingly, seamless integration with large language models (LLMs)\nwith in-context learning to tackle complex and abstract tasks.\n","authors":["Heyuan Yao","Zhenhua Song","Yuyang Zhou","Tenglong Ao","Baoquan Chen","Libin Liu"],"pdf_url":"https://arxiv.org/pdf/2310.10198v2.pdf","comment":"Project page: https://pku-mocca.github.io/MoConVQ-page/"},{"id":"http://arxiv.org/abs/2302.04867v4","updated":"2023-10-17T04:13:57Z","published":"2023-02-09T18:59:48Z","title":"UniPC: A Unified Predictor-Corrector Framework for Fast Sampling of\n Diffusion Models","summary":" Diffusion probabilistic models (DPMs) have demonstrated a very promising\nability in high-resolution image synthesis. However, sampling from a\npre-trained DPM is time-consuming due to the multiple evaluations of the\ndenoising network, making it more and more important to accelerate the sampling\nof DPMs. Despite recent progress in designing fast samplers, existing methods\nstill cannot generate satisfying images in many applications where fewer steps\n(e.g., $<$10) are favored. In this paper, we develop a unified corrector (UniC)\nthat can be applied after any existing DPM sampler to increase the order of\naccuracy without extra model evaluations, and derive a unified predictor (UniP)\nthat supports arbitrary order as a byproduct. Combining UniP and UniC, we\npropose a unified predictor-corrector framework called UniPC for the fast\nsampling of DPMs, which has a unified analytical form for any order and can\nsignificantly improve the sampling quality over previous methods, especially in\nextremely few steps. We evaluate our methods through extensive experiments\nincluding both unconditional and conditional sampling using pixel-space and\nlatent-space DPMs. Our UniPC can achieve 3.87 FID on CIFAR10 (unconditional)\nand 7.51 FID on ImageNet 256$\\times$256 (conditional) with only 10 function\nevaluations. Code is available at https://github.com/wl-zhao/UniPC.\n","authors":["Wenliang Zhao","Lujia Bai","Yongming Rao","Jie Zhou","Jiwen Lu"],"pdf_url":"https://arxiv.org/pdf/2302.04867v4.pdf","comment":"Accepted by NeurIPS 2023. Project page:\n https://unipc.ivg-research.xyz"},{"id":"http://arxiv.org/abs/2310.10975v1","updated":"2023-10-17T03:42:12Z","published":"2023-10-17T03:42:12Z","title":"NICE: Improving Panoptic Narrative Detection and Segmentation with\n Cascading Collaborative Learning","summary":" Panoptic Narrative Detection (PND) and Segmentation (PNS) are two challenging\ntasks that involve identifying and locating multiple targets in an image\naccording to a long narrative description. In this paper, we propose a unified\nand effective framework called NICE that can jointly learn these two panoptic\nnarrative recognition tasks. Existing visual grounding tasks use a two-branch\nparadigm, but applying this directly to PND and PNS can result in prediction\nconflict due to their intrinsic many-to-many alignment property. 
To address\nthis, we introduce two cascading modules based on the barycenter of the mask,\nwhich are Coordinate Guided Aggregation (CGA) and Barycenter Driven\nLocalization (BDL), responsible for segmentation and detection, respectively.\nBy linking PNS and PND in series with the barycenter of segmentation as the\nanchor, our approach naturally aligns the two tasks and allows them to\ncomplement each other for improved performance. Specifically, CGA provides the\nbarycenter as a reference for detection, reducing BDL's reliance on a large\nnumber of candidate boxes. BDL leverages its strong ability to\ndistinguish different instances, which improves the performance of CGA for\nsegmentation. Extensive experiments demonstrate that NICE surpasses all\nexisting methods by a large margin, achieving gains of 4.1% for PND and 2.9% for PNS\nover the state of the art. These results validate the effectiveness of our\nproposed collaborative learning strategy. The project is made\npublicly available at https://github.com/Mr-Neko/NICE.\n","authors":["Haowei Wang","Jiayi Ji","Tianyu Guo","Yilong Yang","Yiyi Zhou","Xiaoshuai Sun","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2310.10975v1.pdf","comment":"18 pages. 9 figures, 9 tables"},{"id":"http://arxiv.org/abs/2310.01596v2","updated":"2023-10-17T03:41:32Z","published":"2023-10-02T19:41:42Z","title":"ImagenHub: Standardizing the evaluation of conditional image generation\n models","summary":" Recently, a myriad of conditional image generation and editing models have\nbeen developed to serve different downstream tasks, including text-to-image\ngeneration, text-guided image editing, subject-driven image generation,\ncontrol-guided image generation, etc. However, we observe huge inconsistencies\nin experimental conditions (datasets, inference, and evaluation metrics) that\nrender fair comparisons difficult. This paper proposes ImagenHub, a\none-stop library to standardize the inference and evaluation of all\nconditional image generation models. Firstly, we define seven prominent tasks\nand curate high-quality evaluation datasets for them. Secondly, we build a\nunified inference pipeline to ensure fair comparison. Thirdly, we design two\nhuman evaluation scores, i.e., Semantic Consistency and Perceptual Quality,\nalong with comprehensive guidelines to evaluate generated images. We train\nexpert raters to evaluate the model outputs based on the proposed metrics. Our\nhuman evaluation achieves high inter-worker agreement, with a Krippendorff's alpha\nabove 0.4 for 76% of the models. We comprehensively evaluated a\ntotal of around 30 models and observed three key takeaways: (1) the existing\nmodels' performance is generally unsatisfying except for Text-guided Image\nGeneration and Subject-driven Image Generation, with 74% of models achieving an\noverall score lower than 0.5. (2) We examined the claims from published papers\nand found that 83% of them hold, with a few exceptions. (3) None of the existing\nautomatic metrics achieves a Spearman's correlation higher than 0.2, except for\nsubject-driven image generation. 
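To make the "barycenter of the mask" anchor used by NICE's cascading modules concrete, here is a minimal NumPy sketch of computing a mask barycenter (center of mass); the toy mask and the way the point would be consumed downstream are illustrative assumptions, not the paper's implementation.

```python
import numpy as np

# Toy binary mask standing in for a predicted segmentation; in NICE the
# barycenter of such a mask would serve as the anchor shared between the
# segmentation (CGA) and detection (BDL) branches.
mask = np.zeros((6, 8), dtype=np.float32)
mask[2:5, 3:7] = 1.0

def mask_barycenter(mask):
    """Return the (row, col) center of mass of a binary or soft mask."""
    ys, xs = np.indices(mask.shape)
    total = mask.sum()
    return float((ys * mask).sum() / total), float((xs * mask).sum() / total)

print(mask_barycenter(mask))  # -> (3.0, 4.5) for this toy mask
```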
Moving forward, we will continue our efforts\nto evaluate newly published models and update our leaderboard to keep track of\nthe progress in conditional image generation.\n","authors":["Max Ku","Tianle Li","Kai Zhang","Yujie Lu","Xingyu Fu","Wenwen Zhuang","Wenhu Chen"],"pdf_url":"https://arxiv.org/pdf/2310.01596v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09909v2","updated":"2023-10-17T03:41:09Z","published":"2023-10-15T18:32:27Z","title":"Can GPT-4V(ision) Serve Medical Applications? Case Studies on GPT-4V for\n Multimodal Medical Diagnosis","summary":" Driven by large foundation models, the development of artificial\nintelligence has witnessed tremendous progress lately, leading to a surge of\ngeneral interest from the public. In this study, we aim to assess the\nperformance of OpenAI's newest model, GPT-4V(ision), specifically in the realm\nof multimodal medical diagnosis. Our evaluation encompasses 17 human body\nsystems, including Central Nervous System, Head and Neck, Cardiac, Chest,\nHematology, Hepatobiliary, Gastrointestinal, Urogenital, Gynecology,\nObstetrics, Breast, Musculoskeletal, Spine, Vascular, Oncology, Trauma, and\nPediatrics, with images taken from 8 modalities used in daily clinical routine,\ne.g., X-ray, Computed Tomography (CT), Magnetic Resonance Imaging (MRI),\nPositron Emission Tomography (PET), Digital Subtraction Angiography (DSA),\nMammography, Ultrasound, and Pathology. We probe GPT-4V's ability on\nmultiple clinical tasks, with or without patient history provided, including\nimaging modality and anatomy recognition, disease diagnosis, report generation,\nand disease localisation.\n Our observations show that, while GPT-4V demonstrates proficiency in\ndistinguishing between medical image modalities and anatomy, it faces\nsignificant challenges in disease diagnosis and generating comprehensive\nreports. These findings underscore that, while large multimodal models have made\nsignificant advancements in computer vision and natural language processing, they\nremain far from ready to effectively support real-world medical\napplications and clinical decision-making.\n All images used in this report can be found at\nhttps://github.com/chaoyi-wu/GPT-4V_Medical_Evaluation.\n","authors":["Chaoyi Wu","Jiayu Lei","Qiaoyu Zheng","Weike Zhao","Weixiong Lin","Xiaoman Zhang","Xiao Zhou","Ziheng Zhao","Ya Zhang","Yanfeng Wang","Weidi Xie"],"pdf_url":"https://arxiv.org/pdf/2310.09909v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.10533v2","updated":"2023-10-17T03:37:22Z","published":"2023-10-16T15:54:09Z","title":"Label-efficient Segmentation via Affinity Propagation","summary":" Weakly-supervised segmentation with label-efficient sparse annotations has\nattracted increasing research attention to reduce the cost of the laborious\npixel-wise labeling process, while pairwise affinity modeling techniques\nplay an essential role in this task. Most of the existing approaches focus on\nusing the local appearance kernel to model the neighboring pairwise potentials.\nHowever, such a local operation fails to capture the long-range dependencies\nand ignores the topology of objects. In this work, we formulate affinity\nmodeling as an affinity propagation process, and propose local and global\npairwise affinity terms to generate accurate soft pseudo labels. An efficient\nalgorithm is also developed to significantly reduce the computational cost. The\nproposed approach can be conveniently plugged into existing segmentation\nnetworks. 
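The local appearance kernel mentioned in the affinity-propagation summary above is commonly a Gaussian on the colour difference between neighbouring pixels; the sketch below shows that generic formulation. The bandwidth, neighbourhood offsets, and toy image are assumptions, and the wrap-around at borders introduced by np.roll is ignored for brevity.

```python
import numpy as np

rng = np.random.default_rng(0)
image = rng.uniform(size=(32, 32, 3))   # toy RGB image in [0, 1]
sigma = 0.1                             # assumed colour bandwidth

def local_affinity(image, sigma, dy, dx):
    """Gaussian appearance affinity between each pixel and its (dy, dx) neighbour."""
    shifted = np.roll(image, shift=(dy, dx), axis=(0, 1))
    diff2 = ((image - shifted) ** 2).sum(axis=-1)
    return np.exp(-diff2 / (2.0 * sigma ** 2))

# Affinities to the right and bottom neighbours: values near 1 suggest the two
# pixels belong to the same region, which is what lets sparse labels propagate.
a_right = local_affinity(image, sigma, 0, 1)
a_down = local_affinity(image, sigma, 1, 0)
print(a_right.shape, float(a_right.mean()))
```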
Experiments on three typical label-efficient segmentation tasks, i.e.\nbox-supervised instance segmentation, point/scribble-supervised semantic\nsegmentation and CLIP-guided semantic segmentation, demonstrate the superior\nperformance of the proposed approach.\n","authors":["Wentong Li","Yuqian Yuan","Song Wang","Wenyu Liu","Dongqi Tang","Jian Liu","Jianke Zhu","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.10533v2.pdf","comment":"NeurIPS2023 Acceptance. Project\n Page:https://LiWentomng.github.io/apro/. Code:\n https://github.com/CircleRadon/APro"},{"id":"http://arxiv.org/abs/2310.08872v2","updated":"2023-10-17T03:36:26Z","published":"2023-10-13T05:48:42Z","title":"R&B: Region and Boundary Aware Zero-shot Grounded Text-to-image\n Generation","summary":" Recent text-to-image (T2I) diffusion models have achieved remarkable progress\nin generating high-quality images given text-prompts as input. However, these\nmodels fail to convey appropriate spatial composition specified by a layout\ninstruction. In this work, we probe into zero-shot grounded T2I generation with\ndiffusion models, that is, generating images corresponding to the input layout\ninformation without training auxiliary modules or finetuning diffusion models.\nWe propose a Region and Boundary (R&B) aware cross-attention guidance approach\nthat gradually modulates the attention maps of diffusion model during\ngenerative process, and assists the model to synthesize images (1) with high\nfidelity, (2) highly compatible with textual input, and (3) interpreting layout\ninstructions accurately. Specifically, we leverage the discrete sampling to\nbridge the gap between consecutive attention maps and discrete layout\nconstraints, and design a region-aware loss to refine the generative layout\nduring diffusion process. We further propose a boundary-aware loss to\nstrengthen object discriminability within the corresponding regions.\nExperimental results show that our method outperforms existing state-of-the-art\nzero-shot grounded T2I generation methods by a large margin both qualitatively\nand quantitatively on several benchmarks.\n","authors":["Jiayu Xiao","Liang Li","Henglei Lv","Shuhui Wang","Qingming Huang"],"pdf_url":"https://arxiv.org/pdf/2310.08872v2.pdf","comment":"Preprint. Under review. Project page:\n https://sagileo.github.io/Region-and-Boundary"},{"id":"http://arxiv.org/abs/2310.10971v1","updated":"2023-10-17T03:35:27Z","published":"2023-10-17T03:35:27Z","title":"Context-Aware Meta-Learning","summary":" Large Language Models like ChatGPT demonstrate a remarkable capacity to learn\nnew concepts during inference without any fine-tuning. However, visual models\ntrained to detect new objects during inference have been unable to replicate\nthis ability, and instead either perform poorly or require meta-training and/or\nfine-tuning on similar objects. In this work, we propose a meta-learning\nalgorithm that emulates Large Language Models by learning new visual concepts\nduring inference without fine-tuning. Our approach leverages a frozen\npre-trained feature extractor, and analogous to in-context learning, recasts\nmeta-learning as sequence modeling over datapoints with known labels and a test\ndatapoint with an unknown label. On 8 out of 11 meta-learning benchmarks, our\napproach -- without meta-training or fine-tuning -- exceeds or matches the\nstate-of-the-art algorithm, P>M>F, which is meta-trained on these benchmarks.\n","authors":["Christopher Fifty","Dennis Duan","Ronald G. 
Junkins","Ehsan Amid","Jure Leskovec","Christopher Ré","Sebastian Thrun"],"pdf_url":"https://arxiv.org/pdf/2310.10971v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.04264v4","updated":"2023-10-17T03:34:15Z","published":"2023-02-08T18:58:00Z","title":"Nerfstudio: A Modular Framework for Neural Radiance Field Development","summary":" Neural Radiance Fields (NeRF) are a rapidly growing area of research with\nwide-ranging applications in computer vision, graphics, robotics, and more. In\norder to streamline the development and deployment of NeRF research, we propose\na modular PyTorch framework, Nerfstudio. Our framework includes plug-and-play\ncomponents for implementing NeRF-based methods, which make it easy for\nresearchers and practitioners to incorporate NeRF into their projects.\nAdditionally, the modular design enables support for extensive real-time\nvisualization tools, streamlined pipelines for importing captured in-the-wild\ndata, and tools for exporting to video, point cloud and mesh representations.\nThe modularity of Nerfstudio enables the development of Nerfacto, our method\nthat combines components from recent papers to achieve a balance between speed\nand quality, while also remaining flexible to future modifications. To promote\ncommunity-driven development, all associated code and data are made publicly\navailable with open-source licensing at https://nerf.studio.\n","authors":["Matthew Tancik","Ethan Weber","Evonne Ng","Ruilong Li","Brent Yi","Justin Kerr","Terrance Wang","Alexander Kristoffersen","Jake Austin","Kamyar Salahi","Abhik Ahuja","David McAllister","Angjoo Kanazawa"],"pdf_url":"https://arxiv.org/pdf/2302.04264v4.pdf","comment":"Project page at https://nerf.studio"},{"id":"http://arxiv.org/abs/2310.05647v3","updated":"2023-10-17T03:28:55Z","published":"2023-10-09T11:59:11Z","title":"Exploiting Manifold Structured Data Priors for Improved MR\n Fingerprinting Reconstruction","summary":" Estimating tissue parameter maps with high accuracy and precision from highly\nundersampled measurements presents one of the major challenges in MR\nfingerprinting (MRF). Many existing works project the recovered voxel\nfingerprints onto the Bloch manifold to improve reconstruction performance.\nHowever, little research focuses on exploiting the latent manifold structure\npriors among fingerprints. To fill this gap, we propose a novel MRF\nreconstruction framework based on manifold structured data priors. Since it is\ndifficult to directly estimate the fingerprint manifold structure, we model the\ntissue parameters as points on a low-dimensional parameter manifold. We reveal\nthat the fingerprint manifold shares the same intrinsic topology as the\nparameter manifold, although being embedded in different Euclidean spaces. To\nexploit the non-linear and non-local redundancies in MRF data, we divide the\nMRF data into spatial patches, and the similarity measurement among data\npatches can be accurately obtained using the Euclidean distance between the\ncorresponding patches in the parameter manifold. The measured similarity is\nthen used to construct the graph Laplacian operator, which represents the\nfingerprint manifold structure. Thus, the fingerprint manifold structure is\nintroduced in the reconstruction framework by using the low-dimensional\nparameter manifold. Additionally, we incorporate the locally low-rank prior in\nthe reconstruction framework to further utilize the local correlations within\neach patch for improved reconstruction performance. 
We also adopt a\nGPU-accelerated NUFFT library to accelerate reconstruction in non-Cartesian\nsampling scenarios. Experimental results demonstrate that our method\nachieves significantly improved reconstruction performance with reduced\ncomputational time compared to state-of-the-art methods.\n","authors":["Peng Li","Yuping Ji","Yue Hu"],"pdf_url":"https://arxiv.org/pdf/2310.05647v3.pdf","comment":"10 pages, 10 figures, will submit to IEEE Transactions on Medical\n Imaging"},{"id":"http://arxiv.org/abs/2303.11632v2","updated":"2023-10-17T03:26:22Z","published":"2023-03-21T07:00:13Z","title":"An Embarrassingly Simple Approach for Wafer Feature Extraction and\n Defect Pattern Recognition","summary":" Identifying defect patterns in a wafer map during manufacturing is crucial for\nfinding the root cause of the underlying issue and provides valuable insights for\nimproving yield in the foundry. Current methods use deep neural networks\nto identify the defects. These models are generally very large and have\nsignificant inference time, and they require GPU support to operate\nefficiently. These issues make such models unsuitable for on-line prediction in\nthe manufacturing foundry. In this paper, we propose an extremely simple yet\neffective technique to extract features from wafer images. The proposed method\nis extremely fast, intuitive, and non-parametric while being explainable. The\nexperimental results show that the proposed pipeline outperforms conventional\ndeep learning models. Our feature extraction requires no training or\nfine-tuning while preserving the relative shape and location of data points, as\nrevealed by our interpretability analysis.\n","authors":["Nitish Shukla"],"pdf_url":"https://arxiv.org/pdf/2303.11632v2.pdf","comment":"study is not relevant"},{"id":"http://arxiv.org/abs/2303.13827v2","updated":"2023-10-17T03:26:07Z","published":"2023-03-24T06:24:07Z","title":"Efficient Mixed-Type Wafer Defect Pattern Recognition Using Compact\n Deformable Convolutional Transformers","summary":" Manufacturing wafers is an intricate task involving thousands of steps.\nDefect Pattern Recognition (DPR) of wafer maps is crucial for finding the root\ncause of the issue and further improving the yield in the wafer foundry.\nMixed-type DPR is much more complicated than single-type DPR due to\nvaried spatial features, the uncertainty of defects, and the number of defects\npresent. To accurately predict the number of defects as well as the types of\ndefects, we propose a novel compact deformable convolutional transformer (DC\nTransformer). Specifically, DC Transformer focuses on the global features\npresent in the wafer map by virtue of learnable deformable kernels and\nmulti-head attention over these features. The proposed method succinctly\nmodels the internal relationship between the wafer maps and the defects. DC\nTransformer is evaluated on a real dataset containing 38 defect patterns.\nExperimental results show that DC Transformer performs exceptionally well in\nrecognizing both single and mixed-type defects. 
The proposed method outperforms\nthe current state-of-the-art models by a considerable margin.\n","authors":["Nitish Shukla"],"pdf_url":"https://arxiv.org/pdf/2303.13827v2.pdf","comment":"Study is not relevant"},{"id":"http://arxiv.org/abs/2310.10963v1","updated":"2023-10-17T03:25:22Z","published":"2023-10-17T03:25:22Z","title":"MRI brain tumor segmentation using informative feature vectors and\n kernel dictionary learning","summary":" This paper presents a method based on a kernel dictionary learning algorithm\nfor segmenting brain tumor regions in magnetic resonance images (MRI). A set of\nfirst-order and second-order statistical feature vectors is extracted from\npatches of size 3 x 3 around pixels in the brain MRI scans. These feature\nvectors are utilized to train two kernel dictionaries separately for healthy\nand tumorous tissues. To enhance the efficiency of the dictionaries and reduce\ntraining time, a correlation-based sample selection technique is developed to\nidentify the most informative and discriminative subset of feature vectors.\nThis technique aims to improve the performance of the dictionaries by selecting\na subset of feature vectors that provide valuable information for the\nsegmentation task. Subsequently, a linear classifier is utilized to distinguish\nbetween healthy and unhealthy pixels based on the learned dictionaries. The\nresults demonstrate that the proposed method outperforms other existing methods\nin terms of segmentation accuracy and significantly reduces both the time and\nmemory required, resulting in a remarkably fast training process.\n","authors":["Seyedeh Mahya Mousavi","Mohammad Mostafavi"],"pdf_url":"https://arxiv.org/pdf/2310.10963v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.10958v1","updated":"2023-10-17T03:11:30Z","published":"2023-10-17T03:11:30Z","title":"Enhancing Deep Neural Network Training Efficiency and Performance\n through Linear Prediction","summary":" Deep neural networks (DNNs) have achieved remarkable success in various\nfields, including computer vision and natural language processing. However,\ntraining an effective DNN model still poses challenges. This paper proposes\na method to optimize the training effectiveness of DNNs, with the goal\nof improving model performance. First, based on the observation that DNN\nparameters follow certain regular patterns during the training process, we identify\nthe potential of parameter prediction for improving training efficiency and performance. Second,\nconsidering the number of DNN model parameters, hardware limitations, and the\nnoise-tolerance characteristics of Stochastic Gradient Descent (SGD), a Parameter Linear\nPrediction (PLP) method is developed to perform DNN parameter prediction. Finally,\nvalidation is carried out on several representative backbones. 
Experimental results show that, compared to normal\ntraining under the same training conditions and number of epochs, employing the\nproposed PLP method yields on average about a 1%\naccuracy improvement and a 0.01 top-1/top-5 error reduction for VGG16, ResNet18,\nand GoogLeNet on the CIFAR-100 dataset, which shows the effectiveness of the\nproposed method on different DNN structures and validates its capacity to\nenhance DNN training efficiency and performance.\n","authors":["Hejie Ying","Mengmeng Song","Yaohong Tang","Shungen Xiao","Zimin Xiao"],"pdf_url":"https://arxiv.org/pdf/2310.10958v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.10957v1","updated":"2023-10-17T03:08:35Z","published":"2023-10-17T03:08:35Z","title":"Medical Image Segmentation via Sparse Coding Decoder","summary":" Transformers have achieved significant success in medical image segmentation,\nowing to their capability to capture long-range dependencies. Previous works\nincorporate convolutional layers into the encoder module of transformers,\nthereby enhancing their ability to learn local relationships among pixels.\nHowever, transformers may suffer from limited generalization capabilities and\nreduced robustness, attributed to the insufficient spatial recovery ability of\ntheir decoders. To address this issue, a convolutional sparse vector coding based\ndecoder is proposed, namely the CAScaded multi-layer Convolutional Sparse vector\nCoding DEcoder (CASCSCDE), which represents features extracted by the encoder\nusing sparse vectors. To prove the effectiveness of CASCSCDE, the\nwidely used TransUNet model is chosen for demonstration purposes, and\nCASCSCDE is incorporated into TransUNet to establish the TransCASCSCDE\narchitecture. Our experiments demonstrate that TransUNet with CASCSCDE\nsignificantly enhances performance on the Synapse benchmark, obtaining up to\n3.15\\% and 1.16\\% improvements in DICE and mIoU scores, respectively. CASCSCDE\nopens new ways for constructing decoders based on convolutional sparse vector\ncoding.\n","authors":["Long Zeng","Kaigui Wu"],"pdf_url":"https://arxiv.org/pdf/2310.10957v1.pdf","comment":"8 pages, 1 figures"},{"id":"http://arxiv.org/abs/2310.10951v1","updated":"2023-10-17T02:56:10Z","published":"2023-10-17T02:56:10Z","title":"FusionU-Net: U-Net with Enhanced Skip Connection for Pathology Image\n Segmentation","summary":" In recent years, U-Net and its variants have been widely used in pathology\nimage segmentation tasks. One of the key designs of U-Net is the use of skip\nconnections between the encoder and decoder, which helps to recover detailed\ninformation after upsampling. While most variations of U-Net adopt the original\nskip connection design, there is a semantic gap between the encoder and decoder\nthat can negatively impact model performance. Therefore, it is important to\nreduce this semantic gap before applying the skip connections. To address this\nissue, we propose a new segmentation network called FusionU-Net, which is based\non the U-Net structure and incorporates a fusion module to exchange information\nbetween different skip connections to reduce semantic gaps. Unlike other\nfusion modules in existing networks, ours is based on a two-round fusion design\nthat fully considers the local relevance between adjacent encoder layer outputs\nand the need for bi-directional information exchange across multiple layers. 
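Relating to the Parameter Linear Prediction (PLP) summary above, the core idea of extrapolating each DNN weight along its recent trajectory can be sketched as a simple linear prediction over two checkpoints; the checkpoint interval and the one-step extrapolation rule are illustrative assumptions rather than the paper's exact schedule.

```python
import numpy as np

rng = np.random.default_rng(0)

# Pretend these are the same weight tensor saved at two recent checkpoints.
w_prev = rng.normal(size=(128, 64)).astype(np.float32)
w_curr = w_prev + 0.01 * rng.normal(size=(128, 64)).astype(np.float32)

def linear_predict(w_prev, w_curr, steps_ahead=1):
    """Extrapolate parameters along their recent linear trend."""
    return w_curr + steps_ahead * (w_curr - w_prev)

# A training loop would load w_pred into the model and resume SGD from there,
# hoping to skip part of the remaining optimisation trajectory.
w_pred = linear_predict(w_prev, w_curr)
print(float(np.abs(w_pred - w_curr).mean()))
```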
We\nconducted extensive experiments on multiple pathology image datasets to\nevaluate our model and found that FusionU-Net achieves better performance\ncompared to other competing methods. We argue our fusion module is more\neffective than the designs of existing networks, and it could be easily\nembedded into other networks to further enhance the model performance.\n","authors":["Zongyi Li","Hongbing Lyu","Jun Wang"],"pdf_url":"https://arxiv.org/pdf/2310.10951v1.pdf","comment":"9 pages, 4 figures and 4 tables"},{"id":"http://arxiv.org/abs/2310.10942v1","updated":"2023-10-17T02:38:09Z","published":"2023-10-17T02:38:09Z","title":"Unanswerable Visual Question Answering","summary":" Teaching Visual Question Answering (VQA) models to abstain from unanswerable\nquestions is indispensable for building a trustworthy AI system. Existing\nstudies, though have explored various aspects of VQA, yet marginally ignored\nthis particular attribute. This paper aims to bridge the research gap by\ncontributing a comprehensive dataset, called UNK-VQA. The dataset is\nspecifically designed to address the challenge of questions that can be\nunanswerable. To this end, we first augment the existing data via deliberate\nperturbations on either the image or question. In specific, we carefully ensure\nthat the question-image semantics remain close to the original unperturbed\ndistribution. By means of this, the identification of unanswerable questions\nbecomes challenging, setting our dataset apart from others that involve mere\nimage replacement. We then extensively evaluate the zero- and few-shot\nperformance of several emerging multi-modal large models and discover\nsignificant limitations of them when applied to our dataset. Additionally, we\nalso propose a straightforward method to tackle these unanswerable questions.\nThis dataset, we believe, will serve as a valuable benchmark for enhancing the\nabstention capability of VQA models, thereby leading to increased\ntrustworthiness of AI systems.\n","authors":["Yanyang Guo","Fangkai Jiao","Zhiqi Shen","Liqiang Nie","Mohan Kankanhalli"],"pdf_url":"https://arxiv.org/pdf/2310.10942v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09759v2","updated":"2023-10-17T02:32:19Z","published":"2023-10-15T07:06:01Z","title":"Prototype-oriented Unsupervised Change Detection for Disaster Management","summary":" Climate change has led to an increased frequency of natural disasters such as\nfloods and cyclones. This emphasizes the importance of effective disaster\nmonitoring. In response, the remote sensing community has explored change\ndetection methods. These methods are primarily categorized into supervised\ntechniques, which yield precise results but come with high labeling costs, and\nunsupervised techniques, which eliminate the need for labeling but involve\nintricate hyperparameter tuning. To address these challenges, we propose a\nnovel unsupervised change detection method named Prototype-oriented\nUnsupervised Change Detection for Disaster Management (PUCD). PUCD captures\nchanges by comparing features from pre-event, post-event, and\nprototype-oriented change synthesis images via a foundational model, and\nrefines results using the Segment Anything Model (SAM). Although PUCD is an\nunsupervised change detection, it does not require complex hyperparameter\ntuning. 
We evaluate PUCD framework on the LEVIR-Extension dataset and the\ndisaster dataset and it achieves state-of-the-art performance compared to other\nmethods on the LEVIR-Extension dataset.\n","authors":["Youngtack Oh","Minseok Seo","Doyi Kim","Junghoon Seo"],"pdf_url":"https://arxiv.org/pdf/2310.09759v2.pdf","comment":"4page, 2 figures"},{"id":"http://arxiv.org/abs/2310.05136v4","updated":"2023-10-17T02:27:52Z","published":"2023-10-08T12:10:44Z","title":"InstructDET: Diversifying Referring Object Detection with Generalized\n Instructions","summary":" We propose InstructDET, a data-centric method for referring object detection\n(ROD) that localizes target objects based on user instructions. While deriving\nfrom referring expressions (REC), the instructions we leverage are greatly\ndiversified to encompass common user intentions related to object detection.\nFor one image, we produce tremendous instructions that refer to every single\nobject and different combinations of multiple objects. Each instruction and its\ncorresponding object bounding boxes (bbxs) constitute one training data pair.\nIn order to encompass common detection expressions, we involve emerging\nvision-language model (VLM) and large language model (LLM) to generate\ninstructions guided by text prompts and object bbxs, as the generalizations of\nfoundation models are effective to produce human-like expressions (e.g.,\ndescribing object property, category, and relationship). We name our\nconstructed dataset as InDET. It contains images, bbxs and generalized\ninstructions that are from foundation models. Our InDET is developed from\nexisting REC datasets and object detection datasets, with the expanding\npotential that any image with object bbxs can be incorporated through using our\nInstructDET method. By using our InDET dataset, we show that a conventional ROD\nmodel surpasses existing methods on standard REC datasets and our InDET test\nset. Our data-centric method InstructDET, with automatic data expansion by\nleveraging foundation models, directs a promising field that ROD can be greatly\ndiversified to execute common object detection instructions.\n","authors":["Ronghao Dang","Jiangyan Feng","Haodong Zhang","Chongjian Ge","Lin Song","Lijun Gong","Chengju Liu","Qijun Chen","Feng Zhu","Rui Zhao","Yibing Song"],"pdf_url":"https://arxiv.org/pdf/2310.05136v4.pdf","comment":"27 pages (include Appendix) Technical Report"},{"id":"http://arxiv.org/abs/2301.05856v2","updated":"2023-10-17T02:07:00Z","published":"2023-01-14T08:32:16Z","title":"EARL: An Elliptical Distribution aided Adaptive Rotation Label\n Assignment for Oriented Object Detection in Remote Sensing Images","summary":" Label assignment is a crucial process in object detection, which\nsignificantly influences the detection performance by determining positive or\nnegative samples during training process. However, existing label assignment\nstrategies barely consider the characteristics of targets in remote sensing\nimages (RSIs) thoroughly, e.g., large variations in scales and aspect ratios,\nleading to insufficient and imbalanced sampling and introducing more\nlow-quality samples, thereby limiting detection performance. To solve the above\nproblems, an Elliptical Distribution aided Adaptive Rotation Label Assignment\n(EARL) is proposed to select high-quality positive samples adaptively in\nanchor-free detectors. 
Specifically, an adaptive scale sampling (ADS) strategy\nis presented to select samples adaptively among multi-level feature maps\naccording to the scales of targets, which achieves sufficient sampling with\nmore balanced scale-level sample distribution. In addition, a dynamic\nelliptical distribution aided sampling (DED) strategy is proposed to make the\nsample distribution more flexible to fit the shapes and orientations of\ntargets, and filter out low-quality samples. Furthermore, a spatial distance\nweighting (SDW) module is introduced to integrate the adaptive distance\nweighting into loss function, which makes the detector more focused on the\nhigh-quality samples. Extensive experiments on several popular datasets\ndemonstrate the effectiveness and superiority of our proposed EARL, where\nwithout bells and whistles, it can be easily applied to different detectors and\nachieve state-of-the-art performance. The source code will be available at:\nhttps://github.com/Justlovesmile/EARL.\n","authors":["Jian Guan","Mingjie Xie","Youtian Lin","Guangjun He","Pengming Feng"],"pdf_url":"https://arxiv.org/pdf/2301.05856v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.16979v2","updated":"2023-10-17T02:06:18Z","published":"2023-06-29T14:33:20Z","title":"Defending Black-box Classifiers by Bayesian Boundary Correction","summary":" Classifiers based on deep neural networks have been recently challenged by\nAdversarial Attack, where the widely existing vulnerability has invoked the\nresearch in defending them from potential threats. Given a vulnerable\nclassifier, existing defense methods are mostly white-box and often require\nre-training the victim under modified loss functions/training regimes. While\nthe model/data/training specifics of the victim are usually unavailable to the\nuser, re-training is unappealing, if not impossible for reasons such as limited\ncomputational resources. To this end, we propose a new black-box defense\nframework. It can turn any pre-trained classifier into a resilient one with\nlittle knowledge of the model specifics. This is achieved by new joint Bayesian\ntreatments on the clean data, the adversarial examples and the classifier, for\nmaximizing their joint probability. It is further equipped with a new\npost-train strategy which keeps the victim intact. We name our framework\nBayesian Boundary Correction (BBC). BBC is a general and flexible framework\nthat can easily adapt to different data types. We instantiate BBC for image\nclassification and skeleton-based human activity recognition, for both static\nand dynamic data. Exhaustive evaluation shows that BBC has superior robustness\nand can enhance robustness without severely hurting the clean accuracy,\ncompared with existing defense methods.\n","authors":["He Wang","Yunfeng Diao"],"pdf_url":"https://arxiv.org/pdf/2306.16979v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2203.04713"},{"id":"http://arxiv.org/abs/2309.04965v2","updated":"2023-10-17T01:30:57Z","published":"2023-09-10T08:55:24Z","title":"Prefix-diffusion: A Lightweight Diffusion Model for Diverse Image\n Captioning","summary":" While impressive performance has been achieved in image captioning, the\nlimited diversity of the generated captions and the large parameter scale\nremain major barriers to the real-word application of these systems. In this\nwork, we propose a lightweight image captioning network in combination with\ncontinuous diffusion, called Prefix-diffusion. 
To achieve diversity, we design\nan efficient method that injects prefix image embeddings into the denoising\nprocess of the diffusion model. In order to reduce trainable parameters, we\nemploy a pre-trained model to extract image features and further design an\nextra mapping network. Prefix-diffusion is able to generate diverse captions\nwith relatively less parameters, while maintaining the fluency and relevance of\nthe captions benefiting from the generative capabilities of the diffusion\nmodel. Our work paves the way for scaling up diffusion models for image\ncaptioning, and achieves promising performance compared with recent approaches.\n","authors":["Guisheng Liu","Yi Li","Zhengcong Fei","Haiyan Fu","Xiangyang Luo","Yanqing Guo"],"pdf_url":"https://arxiv.org/pdf/2309.04965v2.pdf","comment":"11 pages,4 figures, 6 tables"},{"id":"http://arxiv.org/abs/2310.10912v1","updated":"2023-10-17T01:12:08Z","published":"2023-10-17T01:12:08Z","title":"Towards Training-free Open-world Segmentation via Image Prompting\n Foundation Models","summary":" The realm of computer vision has witnessed a paradigm shift with the advent\nof foundational models, mirroring the transformative influence of large\nlanguage models in the domain of natural language processing. This paper delves\ninto the exploration of open-world segmentation, presenting a novel approach\ncalled Image Prompt Segmentation (IPSeg) that harnesses the power of vision\nfoundational models. At the heart of IPSeg lies the principle of a\ntraining-free paradigm, which capitalizes on image prompting techniques. IPSeg\nutilizes a single image containing a subjective visual concept as a flexible\nprompt to query vision foundation models like DINOv2 and Stable Diffusion. Our\napproach extracts robust features for the prompt image and input image, then\nmatches the input representations to the prompt representations via a novel\nfeature interaction module to generate point prompts highlighting target\nobjects in the input image. The generated point prompts are further utilized to\nguide the Segment Anything Model to segment the target object in the input\nimage. The proposed method stands out by eliminating the need for exhaustive\ntraining sessions, thereby offering a more efficient and scalable solution.\nExperiments on COCO, PASCAL VOC, and other datasets demonstrate IPSeg's\nefficacy for flexible open-world segmentation using intuitive image prompts.\nThis work pioneers tapping foundation models for open-world understanding\nthrough visual concepts conveyed in images.\n","authors":["Lv Tang","Peng-Tao Jiang","Hao-Ke Xiao","Bo Li"],"pdf_url":"https://arxiv.org/pdf/2310.10912v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.04780v4","updated":"2023-10-17T00:51:46Z","published":"2023-10-07T11:45:33Z","title":"IPMix: Label-Preserving Data Augmentation Method for Training Robust\n Classifiers","summary":" Data augmentation has been proven effective for training high-accuracy\nconvolutional neural network classifiers by preventing overfitting. However,\nbuilding deep neural networks in real-world scenarios requires not only high\naccuracy on clean data but also robustness when data distributions shift. While\nprior methods have proposed that there is a trade-off between accuracy and\nrobustness, we propose IPMix, a simple data augmentation approach to improve\nrobustness without hurting clean accuracy. 
IPMix integrates three levels of\ndata augmentation (image-level, patch-level, and pixel-level) into a coherent\nand label-preserving technique to increase the diversity of training data with\nlimited computational overhead. To further improve the robustness, IPMix\nintroduces structural complexity at different levels to generate more diverse\nimages and adopts the random mixing method for multi-scale information fusion.\nExperiments demonstrate that IPMix outperforms state-of-the-art corruption\nrobustness on CIFAR-C and ImageNet-C. In addition, we show that IPMix also\nsignificantly improves the other safety measures, including robustness to\nadversarial perturbations, calibration, prediction consistency, and anomaly\ndetection, achieving state-of-the-art or comparable results on several\nbenchmarks, including ImageNet-R, ImageNet-A, and ImageNet-O.\n","authors":["Zhenglin Huang","Xianan Bao","Na Zhang","Qingqi Zhang","Xiaomei Tu","Biao Wu","Xi Yang"],"pdf_url":"https://arxiv.org/pdf/2310.04780v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.01998v2","updated":"2023-10-17T00:48:36Z","published":"2023-07-05T03:07:00Z","title":"Zero-Shot Neural Architecture Search: Challenges, Solutions, and\n Opportunities","summary":" Recently, zero-shot (or training-free) Neural Architecture Search (NAS)\napproaches have been proposed to liberate NAS from the expensive training\nprocess. The key idea behind zero-shot NAS approaches is to design proxies that\ncan predict the accuracy of some given networks without training the network\nparameters. The proxies proposed so far are usually inspired by recent progress\nin theoretical understanding of deep learning and have shown great potential on\nseveral datasets and NAS benchmarks. This paper aims to comprehensively review\nand compare the state-of-the-art (SOTA) zero-shot NAS approaches, with an\nemphasis on their hardware awareness. To this end, we first review the\nmainstream zero-shot proxies and discuss their theoretical underpinnings. We\nthen compare these zero-shot proxies through large-scale experiments and\ndemonstrate their effectiveness in both hardware-aware and hardware-oblivious\nNAS scenarios. Finally, we point out several promising ideas to design better\nproxies. Our source code and the list of related papers are available on\nhttps://github.com/SLDGroup/survey-zero-shot-nas.\n","authors":["Guihong Li","Duc Hoang","Kartikeya Bhardwaj","Ming Lin","Zhangyang Wang","Radu Marculescu"],"pdf_url":"https://arxiv.org/pdf/2307.01998v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.08586v2","updated":"2023-10-17T00:12:20Z","published":"2023-09-15T17:43:40Z","title":"Replacing softmax with ReLU in Vision Transformers","summary":" Previous research observed accuracy degradation when replacing the attention\nsoftmax with a point-wise activation such as ReLU. In the context of vision\ntransformers, we find that this degradation is mitigated when dividing by\nsequence length. 
Our experiments training small to large vision transformers on\nImageNet-21k indicate that ReLU-attention can approach or match the performance\nof softmax-attention in terms of scaling behavior as a function of compute.\n","authors":["Mitchell Wortsman","Jaehoon Lee","Justin Gilmer","Simon Kornblith"],"pdf_url":"https://arxiv.org/pdf/2309.08586v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.13013v2","updated":"2023-10-17T00:11:15Z","published":"2023-04-25T17:38:18Z","title":"Stable and low-precision training for large-scale vision-language models","summary":" We introduce new methods for 1) accelerating and 2) stabilizing training for\nlarge language-vision models. 1) For acceleration, we introduce SwitchBack, a\nlinear layer for int8 quantized training which provides a speed-up of 13-25%\nwhile matching the performance of bfloat16 training within 0.1 percentage\npoints for the 1B parameter CLIP ViT-Huge -- the largest int8 training to date.\nOur main focus is int8 as GPU support for float8 is rare, though we also\nanalyze float8 training through simulation. While SwitchBack proves effective\nfor float8, we show that standard techniques are also successful if the network\nis trained and initialized so that large feature magnitudes are discouraged,\nwhich we accomplish via layer-scale initialized with zeros. 2) For stability,\nwe analyze loss spikes and find they consistently occur 1-8 iterations after\nthe squared gradients become under-estimated by their AdamW second moment\nestimator. As a result, we recommend an AdamW-Adafactor hybrid which avoids\nloss spikes when training a CLIP ViT-Huge model and outperforms gradient\nclipping at the scales we test.\n","authors":["Mitchell Wortsman","Tim Dettmers","Luke Zettlemoyer","Ari Morcos","Ali Farhadi","Ludwig Schmidt"],"pdf_url":"https://arxiv.org/pdf/2304.13013v2.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2310.11629v1","updated":"2023-10-17T23:37:23Z","published":"2023-10-17T23:37:23Z","title":"Holistic Parking Slot Detection with Polygon-Shaped Representations","summary":" Current parking slot detection in advanced driver-assistance systems (ADAS)\nprimarily relies on ultrasonic sensors. This method has several limitations\nsuch as the need to scan the entire parking slot before detecting it, the\nincapacity of detecting multiple slots in a row, and the difficulty of\nclassifying them. Due to the complex visual environment, vehicles are equipped\nwith surround view camera systems to detect vacant parking slots. Previous\nresearch works in this field mostly use image-domain models to solve the\nproblem. These two-stage approaches separate the 2D detection and 3D pose\nestimation steps using camera calibration. In this paper, we propose one-step\nHolistic Parking Slot Network (HPS-Net), a tailor-made adaptation of the You\nOnly Look Once (YOLO)v4 algorithm. This camera-based approach directly outputs\nthe four vertex coordinates of the parking slot in topview domain, instead of a\nbounding box in raw camera images. Several visible points and shapes can be\nproposed from different angles. A novel regression loss function named\npolygon-corner Generalized Intersection over Union (GIoU) for polygon vertex\nposition optimization is also proposed to manage the slot orientation and to\ndistinguish the entrance line. Experiments show that HPS-Net can detect various\nvacant parking slots with a F1-score of 0.92 on our internal Valeo Parking\nSlots Dataset (VPSD) and 0.99 on the public dataset PS2.0. 
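The observation in the ReLU-attention summary above, that replacing the attention softmax with a point-wise ReLU works once the scores are divided by sequence length, can be written out directly; the NumPy sketch below contrasts the two attention forms on toy data, with the placement of the 1/L factor following common practice rather than a detail confirmed by the paper.

```python
import numpy as np

rng = np.random.default_rng(0)
L, d = 16, 32                                  # sequence length and head dimension
q, k, v = (rng.normal(size=(L, d)) for _ in range(3))

scores = q @ k.T / np.sqrt(d)

# Standard softmax attention.
w_soft = np.exp(scores - scores.max(axis=-1, keepdims=True))
w_soft /= w_soft.sum(axis=-1, keepdims=True)
out_soft = w_soft @ v

# ReLU attention with the 1/L scaling discussed in the summary above.
w_relu = np.maximum(scores, 0.0) / L
out_relu = w_relu @ v

print(out_soft.shape, out_relu.shape)          # both (16, 32)
```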
It provides a\nsatisfying generalization and robustness in various parking scenarios, such as\nindoor (F1: 0.86) or paved ground (F1: 0.91). Moreover, it achieves a real-time\ndetection speed of 17 FPS on Nvidia Drive AGX Xavier. A demo video can be found\nat https://streamable.com/75j7sj.\n","authors":["Lihao Wang","Antonyo Musabini","Christel Leonet","Rachid Benmokhtar","Amaury Breheret","Chaima Yedes","Fabian Burger","Thomas Boulay","Xavier Perrotton"],"pdf_url":"https://arxiv.org/pdf/2310.11629v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11622v1","updated":"2023-10-17T23:20:36Z","published":"2023-10-17T23:20:36Z","title":"High-Resolution Building and Road Detection from Sentinel-2","summary":" Mapping buildings and roads automatically with remote sensing typically\nrequires high-resolution imagery, which is expensive to obtain and often\nsparsely available. In this work we demonstrate how multiple 10 m resolution\nSentinel-2 images can be used to generate 50 cm resolution building and road\nsegmentation masks. This is done by training a `student' model with access to\nSentinel-2 images to reproduce the predictions of a `teacher' model which has\naccess to corresponding high-resolution imagery. While the predictions do not\nhave all the fine detail of the teacher model, we find that we are able to\nretain much of the performance: for building segmentation we achieve 78.3%\nmIoU, compared to the high-resolution teacher model accuracy of 85.3% mIoU. We\nalso describe a related method for counting individual buildings in a\nSentinel-2 patch which achieves R^2 = 0.91 against true counts. This work opens\nup new possibilities for using freely available Sentinel-2 imagery for a range\nof tasks that previously could only be done with high-resolution satellite\nimagery.\n","authors":["Wojciech Sirko","Emmanuel Asiedu Brempong","Juliana T. C. Marcos","Abigail Annkah","Abel Korme","Mohammed Alewi Hassen","Krishna Sapkota","Tomer Shekel","Abdoulaye Diack","Sella Nevo","Jason Hickey","John Quinn"],"pdf_url":"https://arxiv.org/pdf/2310.11622v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09199v2","updated":"2023-10-17T22:38:51Z","published":"2023-10-13T15:45:19Z","title":"PaLI-3 Vision Language Models: Smaller, Faster, Stronger","summary":" This paper presents PaLI-3, a smaller, faster, and stronger vision language\nmodel (VLM) that compares favorably to similar models that are 10x larger. As\npart of arriving at this strong performance, we compare Vision Transformer\n(ViT) models pretrained using classification objectives to contrastively\n(SigLIP) pretrained ones. We find that, while slightly underperforming on\nstandard image classification benchmarks, SigLIP-based PaLI shows superior\nperformance across various multimodal benchmarks, especially on localization\nand visually-situated text understanding. We scale the SigLIP image encoder up\nto 2 billion parameters, and achieves a new state-of-the-art on multilingual\ncross-modal retrieval. 
We hope that PaLI-3, at only 5B parameters, rekindles\nresearch on fundamental pieces of complex VLMs, and could fuel a new generation\nof scaled-up models.\n","authors":["Xi Chen","Xiao Wang","Lucas Beyer","Alexander Kolesnikov","Jialin Wu","Paul Voigtlaender","Basil Mustafa","Sebastian Goodman","Ibrahim Alabdulmohsin","Piotr Padlewski","Daniel Salz","Xi Xiong","Daniel Vlasic","Filip Pavetic","Keran Rong","Tianli Yu","Daniel Keysers","Xiaohua Zhai","Radu Soricut"],"pdf_url":"https://arxiv.org/pdf/2310.09199v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.09616v8","updated":"2023-10-17T22:06:33Z","published":"2022-09-19T09:16:07Z","title":"Provably Uncertainty-Guided Universal Domain Adaptation","summary":" Universal domain adaptation (UniDA) aims to transfer the knowledge from a\nlabeled source domain to an unlabeled target domain without any assumptions of\nthe label sets, which requires distinguishing the unknown samples from the\nknown ones in the target domain. A main challenge of UniDA is that the\nnonidentical label sets cause the misalignment between the two domains.\nMoreover, the domain discrepancy and the supervised objectives in the source\ndomain easily lead the whole model to be biased towards the common classes and\nproduce overconfident predictions for unknown samples. To address the above\nchallenging problems, we propose a new uncertainty-guided UniDA framework.\nFirstly, we introduce an empirical estimation of the probability of a target\nsample belonging to the unknown class which fully exploits the distribution of\nthe target samples in the latent space. Then, based on the estimation, we\npropose a novel neighbors searching scheme in a linear subspace with a\n$\\delta$-filter to estimate the uncertainty score of a target sample and\ndiscover unknown samples. It fully utilizes the relationship between a target\nsample and its neighbors in the source domain to avoid the influence of domain\nmisalignment. Secondly, this paper well balances the confidences of predictions\nfor both known and unknown samples through an uncertainty-guided margin loss\nbased on the confidences of discovered unknown samples, which can reduce the\ngap between the intra-class variances of known classes with respect to the\nunknown class. Finally, experiments on three public datasets demonstrate that\nour method significantly outperforms existing state-of-the-art methods.\n","authors":["Yifan Wang","Lin Zhang","Ran Song","Paul L. Rosin","Yibin Li","Wei Zhang"],"pdf_url":"https://arxiv.org/pdf/2209.09616v8.pdf","comment":"13 pages. arXiv admin note: text overlap with arXiv:2207.09280"},{"id":"http://arxiv.org/abs/2310.11608v1","updated":"2023-10-17T22:04:42Z","published":"2023-10-17T22:04:42Z","title":"Classification of Safety Driver Attention During Autonomous Vehicle\n Operation","summary":" Despite the continual advances in Advanced Driver Assistance Systems (ADAS)\nand the development of high-level autonomous vehicles (AV), there is a general\nconsensus that for the short to medium term, there is a requirement for a human\nsupervisor to handle the edge cases that inevitably arise. Given this\nrequirement, it is essential that the state of the vehicle operator is\nmonitored to ensure they are contributing to the vehicle's safe operation. This\npaper introduces a dual-source approach integrating data from an infrared\ncamera facing the vehicle operator and vehicle perception systems to produce a\nmetric for driver alertness in order to promote and ensure safe operator\nbehaviour. 
The infrared camera detects the driver's head, enabling the\ncalculation of head orientation, which is relevant as the head typically moves\naccording to the individual's focus of attention. By incorporating\nenvironmental data from the perception system, it becomes possible to determine\nwhether the vehicle operator observes objects in the surroundings. Experiments\nwere conducted using data collected in Sydney, Australia, simulating AV\noperations in an urban environment. Our results demonstrate that the proposed\nsystem effectively determines a metric for the attention levels of the vehicle\noperator, enabling interventions such as warnings or reducing autonomous\nfunctionality as appropriate. This comprehensive solution shows promise in\ncontributing to ADAS and AVs' overall safety and efficiency in a real-world\nsetting.\n","authors":["Santiago Gerling Konrad","Julie Stephany Berrio","Mao Shan","Favio Masson","Stewart Worrall"],"pdf_url":"https://arxiv.org/pdf/2310.11608v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.09473v2","updated":"2023-10-17T22:01:00Z","published":"2023-02-19T04:03:22Z","title":"Video-Text Retrieval by Supervised Sparse Multi-Grained Learning","summary":" While recent progress in video-text retrieval has been advanced by the\nexploration of better representation learning, in this paper, we present a\nnovel multi-grained sparse learning framework, S3MA, to learn an aligned sparse\nspace shared between the video and the text for video-text retrieval. The\nshared sparse space is initialized with a finite number of sparse concepts,\neach of which refers to a number of words. With the text data at hand, we learn\nand update the shared sparse space in a supervised manner using the proposed\nsimilarity and alignment losses. Moreover, to enable multi-grained alignment,\nwe incorporate frame representations for better modeling the video modality and\ncalculating fine-grained and coarse-grained similarities. Benefiting from the\nlearned shared sparse space and multi-grained similarities, extensive\nexperiments on several video-text retrieval benchmarks demonstrate the\nsuperiority of S3MA over existing methods. Our code is available at\nhttps://github.com/yimuwangcs/Better_Cross_Modal_Retrieval.\n","authors":["Yimu Wang","Peng Shi"],"pdf_url":"https://arxiv.org/pdf/2302.09473v2.pdf","comment":"Findings of EMNLP 2023"},{"id":"http://arxiv.org/abs/2310.11605v1","updated":"2023-10-17T21:59:45Z","published":"2023-10-17T21:59:45Z","title":"DIAR: Deep Image Alignment and Reconstruction using Swin Transformers","summary":" When taking images of some occluded content, one is often faced with the\nproblem that every individual image frame contains unwanted artifacts, but a\ncollection of images contains all relevant information if properly aligned and\naggregated. In this paper, we attempt to build a deep learning pipeline that\nsimultaneously aligns a sequence of distorted images and reconstructs them. We\ncreate a dataset that contains images with image distortions, such as lighting,\nspecularities, shadows, and occlusion. We create perspective distortions with\ncorresponding ground-truth homographies as labels. We use our dataset to train\nSwin transformer models to analyze sequential image data. The attention maps\nenable the model to detect relevant image content and differentiate it from\noutliers and artifacts. We further explore using neural feature maps as\nalternatives to classical key point detectors. 
The feature maps of trained\nconvolutional layers provide dense image descriptors that can be used to find\npoint correspondences between images. We utilize this to compute coarse image\nalignments and explore its limitations.\n","authors":["Monika Kwiatkowski","Simon Matern","Olaf Hellwich"],"pdf_url":"https://arxiv.org/pdf/2310.11605v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11598v1","updated":"2023-10-17T21:45:51Z","published":"2023-10-17T21:45:51Z","title":"Learning Neural Implicit through Volume Rendering with Attentive Depth\n Fusion Priors","summary":" Learning neural implicit representations has achieved remarkable performance\nin 3D reconstruction from multi-view images. Current methods use volume\nrendering to render implicit representations into either RGB or depth images\nthat are supervised by multi-view ground truth. However, rendering a view each\ntime suffers from incomplete depth at holes and unawareness of occluded\nstructures from the depth supervision, which severely affects the accuracy of\ngeometry inference via volume rendering. To resolve this issue, we propose to\nlearn neural implicit representations from multi-view RGBD images through\nvolume rendering with an attentive depth fusion prior. Our prior allows neural\nnetworks to perceive coarse 3D structures from the Truncated Signed Distance\nFunction (TSDF) fused from all depth images available for rendering. The TSDF\nenables accessing the missing depth at holes on one depth image and the\noccluded parts that are invisible from the current view. By introducing a novel\nattention mechanism, we allow neural networks to directly use the depth fusion\nprior with the inferred occupancy as the learned implicit function. Our\nattention mechanism works with either a one-time fused TSDF that represents a\nwhole scene or an incrementally fused TSDF that represents a partial scene in\nthe context of Simultaneous Localization and Mapping (SLAM). Our evaluations on\nwidely used benchmarks including synthetic and real-world scans show our\nsuperiority over the latest neural implicit methods. Project page:\nhttps://machineperceptionlab.github.io/Attentive_DF_Prior/\n","authors":["Pengchong Hu","Zhizhong Han"],"pdf_url":"https://arxiv.org/pdf/2310.11598v1.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2310.11595v1","updated":"2023-10-17T21:43:42Z","published":"2023-10-17T21:43:42Z","title":"WaveAttack: Asymmetric Frequency Obfuscation-based Backdoor Attacks\n Against Deep Neural Networks","summary":" Due to the popularity of Artificial Intelligence (AI) technology, numerous\nbackdoor attacks are designed by adversaries to mislead deep neural network\npredictions by manipulating training samples and training processes. Although\nbackdoor attacks are effective in various real scenarios, they still suffer\nfrom the problems of both low fidelity of poisoned samples and non-negligible\ntransfer in latent space, which make them easily detectable by existing\nbackdoor detection algorithms. To overcome the weakness, this paper proposes a\nnovel frequency-based backdoor attack method named WaveAttack, which obtains\nimage high-frequency features through Discrete Wavelet Transform (DWT) to\ngenerate backdoor triggers. Furthermore, we introduce an asymmetric frequency\nobfuscation method, which can add an adaptive residual in the training and\ninference stage to improve the impact of triggers and further enhance the\neffectiveness of WaveAttack. 
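For the frequency decomposition underlying WaveAttack described above, a single-level 2D discrete wavelet transform separates an image into a low-frequency approximation and high-frequency detail subbands; the sketch below uses the third-party PyWavelets package as one possible implementation on a toy image, and does not reproduce the paper's trigger-generation or asymmetric obfuscation steps.

```python
import numpy as np
import pywt  # PyWavelets, assumed here as one convenient DWT implementation

rng = np.random.default_rng(0)
image = rng.uniform(size=(32, 32)).astype(np.float32)  # toy grayscale image

# One level of 2D DWT: cA is the low-frequency approximation, while
# (cH, cV, cD) are the high-frequency detail subbands, i.e. the part of the
# image a WaveAttack-style method would work with to keep changes inconspicuous.
cA, (cH, cV, cD) = pywt.dwt2(image, "haar")
print(cA.shape, cH.shape, cV.shape, cD.shape)  # each (16, 16) for a 32x32 input

# The transform is invertible, so edited subbands map back to image space.
reconstructed = pywt.idwt2((cA, (cH, cV, cD)), "haar")
print(float(np.abs(reconstructed - image).max()))  # close to 0
```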
Comprehensive experimental results show that\nWaveAttack not only achieves higher stealthiness and effectiveness, but also\noutperforms state-of-the-art (SOTA) backdoor attack methods in the fidelity of\nimages by up to 28.27\\% improvement in PSNR, 1.61\\% improvement in SSIM, and\n70.59\\% reduction in IS. Our code is available at\nhttps://anonymous.4open.science/r/AnonymousRep-701D.\n","authors":["Jun Xia","Zhihao Yue","Yingbo Zhou","Zhiwei Ling","Xian Wei","Mingsong Chen"],"pdf_url":"https://arxiv.org/pdf/2310.11595v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11577v1","updated":"2023-10-17T20:55:53Z","published":"2023-10-17T20:55:53Z","title":"Studying the Effects of Sex-related Differences on Brain Age Prediction\n using brain MR Imaging","summary":" While utilizing machine learning models, one of the most crucial aspects is\nhow bias and fairness affect model outcomes for diverse demographics. This\nbecomes especially relevant in the context of machine learning for medical\nimaging applications as these models are increasingly being used for diagnosis\nand treatment planning. In this paper, we study biases related to sex when\ndeveloping a machine learning model based on brain magnetic resonance images\n(MRI). We investigate the effects of sex by performing brain age prediction\nconsidering different experimental designs: model trained using only female\nsubjects, only male subjects and a balanced dataset. We also perform evaluation\non multiple MRI datasets (Calgary-Campinas(CC359) and CamCAN) to assess the\ngeneralization capability of the proposed models. We found disparities in the\nperformance of brain age prediction models when trained on distinct sex\nsubgroups and datasets, in both final predictions and decision making (assessed\nusing interpretability models). Our results demonstrated variations in model\ngeneralizability across sex-specific subgroups, suggesting potential biases in\nmodels trained on unbalanced datasets. This underlines the critical role of\ncareful experimental design in generating fair and reliable outcomes.\n","authors":["Mahsa Dibaji","Neha Gianchandani","Akhil Nair","Mansi Singhal","Roberto Souza","Mariana Bento"],"pdf_url":"https://arxiv.org/pdf/2310.11577v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.12237v2","updated":"2023-10-17T20:50:05Z","published":"2023-03-21T23:44:02Z","title":"Automated deep learning segmentation of high-resolution 7 T postmortem\n MRI for quantitative analysis of structure-pathology correlations in\n neurodegenerative diseases","summary":" Postmortem MRI allows brain anatomy to be examined at high resolution and to\nlink pathology measures with morphometric measurements. However, automated\nsegmentation methods for brain mapping in postmortem MRI are not well\ndeveloped, primarily due to limited availability of labeled datasets, and\nheterogeneity in scanner hardware and acquisition protocols. In this work, we\npresent a high resolution of 135 postmortem human brain tissue specimens imaged\nat 0.3 mm$^{3}$ isotropic using a T2w sequence on a 7T whole-body MRI scanner.\nWe developed a deep learning pipeline to segment the cortical mantle by\nbenchmarking the performance of nine deep neural architectures, followed by\npost-hoc topological correction. We then segment four subcortical structures\n(caudate, putamen, globus pallidus, and thalamus), white matter\nhyperintensities, and the normal appearing white matter. 
We show generalizing\ncapabilities across whole brain hemispheres in different specimens, and also on\nunseen images acquired at 0.28 mm^3 and 0.16 mm^3 isotropic T2*w FLASH sequence\nat 7T. We then compute localized cortical thickness and volumetric measurements\nacross key regions, and link them with semi-quantitative neuropathological\nratings. Our code, Jupyter notebooks, and the containerized executables are\npublicly available at: https://pulkit-khandelwal.github.io/exvivo-brain-upenn\n","authors":["Pulkit Khandelwal","Michael Tran Duong","Shokufeh Sadaghiani","Sydney Lim","Amanda Denning","Eunice Chung","Sadhana Ravikumar","Sanaz Arezoumandan","Claire Peterson","Madigan Bedard","Noah Capp","Ranjit Ittyerah","Elyse Migdal","Grace Choi","Emily Kopp","Bridget Loja","Eusha Hasan","Jiacheng Li","Alejandra Bahena","Karthik Prabhakaran","Gabor Mizsei","Marianna Gabrielyan","Theresa Schuck","Winifred Trotman","John Robinson","Daniel Ohm","Edward B. Lee","John Q. Trojanowski","Corey McMillan","Murray Grossman","David J. Irwin","John Detre","M. Dylan Tisdall","Sandhitsu R. Das","Laura E. M. Wisse","David A. Wolk","Paul A. Yushkevich"],"pdf_url":"https://arxiv.org/pdf/2303.12237v2.pdf","comment":"Preprint submitted to NeuroImage Project website:\n https://pulkit-khandelwal.github.io/exvivo-brain-upenn"},{"id":"http://arxiv.org/abs/2310.11535v1","updated":"2023-10-17T19:10:45Z","published":"2023-10-17T19:10:45Z","title":"Learning Lens Blur Fields","summary":" Optical blur is an inherent property of any lens system and is challenging to\nmodel in modern cameras because of their complex optical elements. To tackle\nthis challenge, we introduce a high-dimensional neural representation of\nblur$-$$\\textit{the lens blur field}$$-$and a practical method for acquiring\nit. The lens blur field is a multilayer perceptron (MLP) designed to (1)\naccurately capture variations of the lens 2D point spread function over image\nplane location, focus setting and, optionally, depth and (2) represent these\nvariations parametrically as a single, sensor-specific function. The\nrepresentation models the combined effects of defocus, diffraction, aberration,\nand accounts for sensor features such as pixel color filters and pixel-specific\nmicro-lenses. To learn the real-world blur field of a given device, we\nformulate a generalized non-blind deconvolution problem that directly optimizes\nthe MLP weights using a small set of focal stacks as the only input. We also\nprovide a first-of-its-kind dataset of 5D blur fields$-$for smartphone cameras,\ncamera bodies equipped with a variety of lenses, etc. Lastly, we show that\nacquired 5D blur fields are expressive and accurate enough to reveal, for the\nfirst time, differences in optical behavior of smartphone devices of the same\nmake and model.\n","authors":["Esther Y. H. Lin","Zhecheng Wang","Rebecca Lin","Daniel Miau","Florian Kainz","Jiawen Chen","Xuaner Cecilia Zhang","David B. Lindell","Kiriakos N. Kutulakos"],"pdf_url":"https://arxiv.org/pdf/2310.11535v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.15084v2","updated":"2023-10-17T18:54:20Z","published":"2023-09-26T17:27:22Z","title":"The Surveillance AI Pipeline","summary":" A rapidly growing number of voices argue that AI research, and computer\nvision in particular, is powering mass surveillance. Yet the direct path from\ncomputer vision research to surveillance has remained obscured and difficult to\nassess. 
Here, we reveal the Surveillance AI pipeline by analyzing three decades\nof computer vision research papers and downstream patents, more than 40,000\ndocuments. We find the large majority of annotated computer vision papers and\npatents self-report their technology enables extracting data about humans.\nMoreover, the majority of these technologies specifically enable extracting\ndata about human bodies and body parts. We present both quantitative and rich\nqualitative analysis illuminating these practices of human data extraction.\nStudying the roots of this pipeline, we find that institutions that\nprolifically produce computer vision research, namely elite universities and\n\"big tech\" corporations, are subsequently cited in thousands of surveillance\npatents. Further, we find consistent evidence against the narrative that only\nthese few rogue entities are contributing to surveillance. Rather, we expose\nthe fieldwide norm that when an institution, nation, or subfield authors\ncomputer vision papers with downstream patents, the majority of these papers\nare used in surveillance patents. In total, we find the number of papers with\ndownstream surveillance patents increased more than five-fold between the 1990s\nand the 2010s, with computer vision research now having been used in more than\n11,000 surveillance patents. Finally, in addition to the high levels of\nsurveillance we find documented in computer vision papers and patents, we\nunearth pervasive patterns of documents using language that obfuscates the\nextent of surveillance. Our analysis reveals the pipeline by which computer\nvision research has powered the ongoing expansion of surveillance.\n","authors":["Pratyusha Ria Kalluri","William Agnew","Myra Cheng","Kentrell Owens","Luca Soldaini","Abeba Birhane"],"pdf_url":"https://arxiv.org/pdf/2309.15084v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11513v1","updated":"2023-10-17T18:20:03Z","published":"2023-10-17T18:20:03Z","title":"GenEval: An Object-Focused Framework for Evaluating Text-to-Image\n Alignment","summary":" Recent breakthroughs in diffusion models, multimodal pretraining, and\nefficient finetuning have led to an explosion of text-to-image generative\nmodels. Given human evaluation is expensive and difficult to scale, automated\nmethods are critical for evaluating the increasingly large number of new\nmodels. However, most current automated evaluation metrics like FID or\nCLIPScore only offer a holistic measure of image quality or image-text\nalignment, and are unsuited for fine-grained or instance-level analysis. In\nthis paper, we introduce GenEval, an object-focused framework to evaluate\ncompositional image properties such as object co-occurrence, position, count,\nand color. We show that current object detection models can be leveraged to\nevaluate text-to-image models on a variety of generation tasks with strong\nhuman agreement, and that other discriminative vision models can be linked to\nthis pipeline to further verify properties like object color. We then evaluate\nseveral open-source text-to-image models and analyze their relative generative\ncapabilities on our benchmark. We find that recent models demonstrate\nsignificant improvement on these tasks, though they are still lacking in\ncomplex capabilities such as spatial relations and attribute binding. Finally,\nwe demonstrate how GenEval might be used to help discover existing failure\nmodes, in order to inform development of the next generation of text-to-image\nmodels. 
Our code to run the GenEval framework is publicly available at\nhttps://github.com/djghosh13/geneval.\n","authors":["Dhruba Ghosh","Hanna Hajishirzi","Ludwig Schmidt"],"pdf_url":"https://arxiv.org/pdf/2310.11513v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09978v2","updated":"2023-10-17T18:15:15Z","published":"2023-10-15T23:05:17Z","title":"Chinese Painting Style Transfer Using Deep Generative Models","summary":" Artistic style transfer aims to modify the style of the image while\npreserving its content. Style transfer using deep learning models has been\nwidely studied since 2015, and most of the applications are focused on specific\nartists like Van Gogh, Monet, Cezanne. There are few researches and\napplications on traditional Chinese painting style transfer. In this paper, we\nwill study and leverage different state-of-the-art deep generative models for\nChinese painting style transfer and evaluate the performance both qualitatively\nand quantitatively. In addition, we propose our own algorithm that combines\nseveral style transfer models for our task. Specifically, we will transfer two\nmain types of traditional Chinese painting style, known as \"Gong-bi\" and\n\"Shui-mo\" (to modern images like nature objects, portraits and landscapes.\n","authors":["Weijian Ma","Yanyang Kong"],"pdf_url":"https://arxiv.org/pdf/2310.09978v2.pdf","comment":"Paper is too old (written in 2019)"},{"id":"http://arxiv.org/abs/2304.10532v3","updated":"2023-10-17T18:15:06Z","published":"2023-04-20T17:59:05Z","title":"Nerfbusters: Removing Ghostly Artifacts from Casually Captured NeRFs","summary":" Casually captured Neural Radiance Fields (NeRFs) suffer from artifacts such\nas floaters or flawed geometry when rendered outside the camera trajectory.\nExisting evaluation protocols often do not capture these effects, since they\nusually only assess image quality at every 8th frame of the training capture.\nTo push forward progress in novel-view synthesis, we propose a new dataset and\nevaluation procedure, where two camera trajectories are recorded of the scene:\none used for training, and the other for evaluation. In this more challenging\nin-the-wild setting, we find that existing hand-crafted regularizers do not\nremove floaters nor improve scene geometry. Thus, we propose a 3D\ndiffusion-based method that leverages local 3D priors and a novel density-based\nscore distillation sampling loss to discourage artifacts during NeRF\noptimization. We show that this data-driven prior removes floaters and improves\nscene geometry for casual captures.\n","authors":["Frederik Warburg","Ethan Weber","Matthew Tancik","Aleksander Holynski","Angjoo Kanazawa"],"pdf_url":"https://arxiv.org/pdf/2304.10532v3.pdf","comment":"ICCV 2023, project page: https://ethanweber.me/nerfbusters"},{"id":"http://arxiv.org/abs/2310.00530v2","updated":"2023-10-17T18:14:02Z","published":"2023-10-01T00:21:01Z","title":"Enabling Neural Radiance Fields (NeRF) for Large-scale Aerial Images --\n A Multi-tiling Approach and the Geometry Assessment of NeRF","summary":" Neural Radiance Fields (NeRF) offer the potential to benefit 3D\nreconstruction tasks, including aerial photogrammetry. However, the scalability\nand accuracy of the inferred geometry are not well-documented for large-scale\naerial assets,since such datasets usually result in very high memory\nconsumption and slow convergence.. 
In this paper, we aim to scale the NeRF on\nlarge-scael aerial datasets and provide a thorough geometry assessment of NeRF.\nSpecifically, we introduce a location-specific sampling technique as well as a\nmulti-camera tiling (MCT) strategy to reduce memory consumption during image\nloading for RAM, representation training for GPU memory, and increase the\nconvergence rate within tiles. MCT decomposes a large-frame image into multiple\ntiled images with different camera models, allowing these small-frame images to\nbe fed into the training process as needed for specific locations without a\nloss of accuracy. We implement our method on a representative approach,\nMip-NeRF, and compare its geometry performance with threephotgrammetric MVS\npipelines on two typical aerial datasets against LiDAR reference data. Both\nqualitative and quantitative results suggest that the proposed NeRF approach\nproduces better completeness and object details than traditional approaches,\nalthough as of now, it still falls short in terms of accuracy.\n","authors":["Ningli Xu","Rongjun Qin","Debao Huang","Fabio Remondino"],"pdf_url":"https://arxiv.org/pdf/2310.00530v2.pdf","comment":"9 Figure"},{"id":"http://arxiv.org/abs/2308.11063v2","updated":"2023-10-17T18:13:48Z","published":"2023-08-21T22:16:49Z","title":"MetaGCD: Learning to Continually Learn in Generalized Category Discovery","summary":" In this paper, we consider a real-world scenario where a model that is\ntrained on pre-defined classes continually encounters unlabeled data that\ncontains both known and novel classes. The goal is to continually discover\nnovel classes while maintaining the performance in known classes. We name the\nsetting Continual Generalized Category Discovery (C-GCD). Existing methods for\nnovel class discovery cannot directly handle the C-GCD setting due to some\nunrealistic assumptions, such as the unlabeled data only containing novel\nclasses. Furthermore, they fail to discover novel classes in a continual\nfashion. In this work, we lift all these assumptions and propose an approach,\ncalled MetaGCD, to learn how to incrementally discover with less forgetting.\nOur proposed method uses a meta-learning framework and leverages the offline\nlabeled data to simulate the testing incremental learning process. A\nmeta-objective is defined to revolve around two conflicting learning objectives\nto achieve novel class discovery without forgetting. Furthermore, a soft\nneighborhood-based contrastive network is proposed to discriminate uncorrelated\nimages while attracting correlated images. We build strong baselines and\nconduct extensive experiments on three widely used benchmarks to demonstrate\nthe superiority of our method.\n","authors":["Yanan Wu","Zhixiang Chi","Yang Wang","Songhe Feng"],"pdf_url":"https://arxiv.org/pdf/2308.11063v2.pdf","comment":"This paper has been accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2309.07891v3","updated":"2023-10-17T18:11:28Z","published":"2023-09-14T17:42:08Z","title":"HandNeRF: Learning to Reconstruct Hand-Object Interaction Scene from a\n Single RGB Image","summary":" This paper presents a method to learn hand-object interaction prior for\nreconstructing a 3D hand-object scene from a single RGB image. The inference as\nwell as training-data generation for 3D hand-object scene reconstruction is\nchallenging due to the depth ambiguity of a single image and occlusions by the\nhand and object. 
We turn this challenge into an opportunity by utilizing the\nhand shape to constrain the possible relative configuration of the hand and\nobject geometry. We design a generalizable implicit function, HandNeRF, that\nexplicitly encodes the correlation of the 3D hand shape features and 2D object\nfeatures to predict the hand and object scene geometry. With experiments on\nreal-world datasets, we show that HandNeRF is able to reconstruct hand-object\nscenes of novel grasp configurations more accurately than comparable methods.\nMoreover, we demonstrate that object reconstruction from HandNeRF ensures more\naccurate execution of downstream tasks, such as grasping and motion planning\nfor robotic hand-over and manipulation. The code will be release here:\nhttps://github.com/SamsungLabs/HandNeRF\n","authors":["Hongsuk Choi","Nikhil Chavan-Dafle","Jiacheng Yuan","Volkan Isler","Hyunsoo Park"],"pdf_url":"https://arxiv.org/pdf/2309.07891v3.pdf","comment":"12 pages including the supplementary material, 8 tables, 12 figures"},{"id":"http://arxiv.org/abs/2310.11482v1","updated":"2023-10-17T13:06:39Z","published":"2023-10-17T13:06:39Z","title":"Rethinking Class-incremental Learning in the Era of Large Pre-trained\n Models via Test-Time Adaptation","summary":" Class-incremental learning (CIL) is a challenging task that involves\ncontinually learning to categorize classes into new tasks without forgetting\npreviously learned information. The advent of the large pre-trained models\n(PTMs) has fast-tracked the progress in CIL due to the highly transferable PTM\nrepresentations, where tuning a small set of parameters results in\nstate-of-the-art performance when compared with the traditional CIL methods\nthat are trained from scratch. However, repeated fine-tuning on each task\ndestroys the rich representations of the PTMs and further leads to forgetting\nprevious tasks. To strike a balance between the stability and plasticity of\nPTMs for CIL, we propose a novel perspective of eliminating training on every\nnew task and instead performing test-time adaptation (TTA) directly on the test\ninstances. Concretely, we propose \"Test-Time Adaptation for Class-Incremental\nLearning\" (TTACIL) that first fine-tunes Layer Norm parameters of the PTM on\neach test instance for learning task-specific features, and then resets them\nback to the base model to preserve stability. As a consequence, TTACIL does not\nundergo any forgetting, while benefiting each task with the rich PTM features.\nAdditionally, by design, our method is robust to common data corruptions. Our\nTTACIL outperforms several state-of-the-art CIL methods when evaluated on\nmultiple CIL benchmarks under both clean and corrupted data.\n","authors":["Imad Eddine Marouf","Subhankar Roy","Enzo Tartaglione","Stéphane Lathuilière"],"pdf_url":"https://arxiv.org/pdf/2310.11482v1.pdf","comment":"8 pages,5 figures"},{"id":"http://arxiv.org/abs/2310.11480v1","updated":"2023-10-17T12:33:43Z","published":"2023-10-17T12:33:43Z","title":"Whole-brain radiomics for clustered federated personalization in brain\n tumor segmentation","summary":" Federated learning and its application to medical image segmentation have\nrecently become a popular research topic. This training paradigm suffers from\nstatistical heterogeneity between participating institutions' local datasets,\nincurring convergence slowdown as well as potential accuracy loss compared to\nclassical training. 
To mitigate this effect, federated personalization emerged\nas the federated optimization of one model per institution. We propose a novel\npersonalization algorithm tailored to the feature shift induced by the usage of\ndifferent scanners and acquisition parameters by different institutions. This\nmethod is the first to account for both inter and intra-institution feature\nshift (multiple scanners used in a single institution). It is based on the\ncomputation, within each centre, of a series of radiomic features capturing the\nglobal texture of each 3D image volume, followed by a clustering analysis\npooling all feature vectors transferred from the local institutions to the\ncentral server. Each computed clustered decentralized dataset (potentially\nincluding data from different institutions) then serves to finetune a global\nmodel obtained through classical federated learning. We validate our approach\non the Federated Brain Tumor Segmentation 2022 Challenge dataset (FeTS2022).\nOur code is available at (https://github.com/MatthisManthe/radiomics_CFFL).\n","authors":["Matthis Manthe","Stefan Duffner","Carole Lartizien"],"pdf_url":"https://arxiv.org/pdf/2310.11480v1.pdf","comment":"Accepted at Medical Imaging with Deep Learning (MiDL) 2023 conference"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2310.11405v1","updated":"2023-10-17T17:13:13Z","published":"2023-10-17T17:13:13Z","title":"On Coherence-based Predictors for Dense Query Performance Prediction","summary":" Query Performance Prediction (QPP) estimates the effectiveness of a search\nengine's results in response to a query without relevance judgments.\nTraditionally, post-retrieval predictors have focused upon either the\ndistribution of the retrieval scores, or the coherence of the top-ranked\ndocuments using traditional bag-of-words index representations. More recently,\nBERT-based models using dense embedded document representations have been used\nto create new predictors, but mostly applied to predict the performance of\nrankings created by BM25. Instead, we aim to predict the effectiveness of\nrankings created by single-representation dense retrieval models (ANCE &\nTCT-ColBERT). Therefore, we propose a number of variants of existing\nunsupervised coherence-based predictors that employ neural embedding\nrepresentations. In our experiments on the TREC Deep Learning Track datasets,\nwe demonstrate improved accuracy upon dense retrieval (up to 92% compared to\nsparse variants for TCT-ColBERT and 188% for ANCE). Going deeper, we select the\nmost representative and best performing predictors to study the importance of\ndifferences among predictors and query types on query performance. Using\nexisting distribution-based evaluation QPP measures and a particular type of\nlinear mixed models, we find that query types further significantly influence\nquery performance (and are up to 35% responsible for the unstable performance\nof QPP predictors), and that this sensitivity is unique to dense retrieval\nmodels. 
Our approach introduces a new setting for obtaining richer information\non query differences in dense QPP that can explain potential unstable\nperformance of existing predictors and outlines the unique characteristics of\ndifferent query types on dense retrieval models.\n","authors":["Maria Vlachou","Craig Macdonald"],"pdf_url":"https://arxiv.org/pdf/2310.11405v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2204.01815v6","updated":"2023-10-17T17:06:34Z","published":"2022-04-04T19:42:46Z","title":"Tensor Completion with Provable Consistency and Fairness Guarantees for\n Recommender Systems","summary":" We introduce a new consistency-based approach for defining and solving\nnonnegative/positive matrix and tensor completion problems. The novelty of the\nframework is that instead of artificially making the problem well-posed in the\nform of an application-arbitrary optimization problem, e.g., minimizing a bulk\nstructural measure such as rank or norm, we show that a single\nproperty/constraint: preserving unit-scale consistency, guarantees the\nexistence of both a solution and, under relatively weak support assumptions,\nuniqueness. The framework and solution algorithms also generalize directly to\ntensors of arbitrary dimensions while maintaining computational complexity that\nis linear in problem size for fixed dimension d. In the context of recommender\nsystem (RS) applications, we prove that two reasonable properties that should\nbe expected to hold for any solution to the RS problem are sufficient to permit\nuniqueness guarantees to be established within our framework. This is\nremarkable because it obviates the need for heuristic-based statistical or AI\nmethods despite what appear to be distinctly human/subjective variables at the\nheart of the problem. Key theoretical contributions include a general\nunit-consistent tensor-completion framework with proofs of its properties,\ne.g., consensus-order and fairness, and algorithms with optimal runtime and\nspace complexities, e.g., O(1) term-completion with preprocessing complexity\nthat is linear in the number of known terms of the matrix/tensor. From a\npractical perspective, the seamless ability of the framework to generalize to\nexploit high-dimensional structural relationships among key state variables,\ne.g., user and product attributes, offers a means for extracting significantly\nmore information than is possible for alternative methods that cannot\ngeneralize beyond direct user-product relationships.\n","authors":["Tung Nguyen","Jeffrey Uhlmann"],"pdf_url":"https://arxiv.org/pdf/2204.01815v6.pdf","comment":"Final published version"},{"id":"http://arxiv.org/abs/2310.11270v1","updated":"2023-10-17T13:42:32Z","published":"2023-10-17T13:42:32Z","title":"Graph Neural Networks for Recommendation: Reproducibility, Graph\n Topology, and Node Representation","summary":" Graph neural networks (GNNs) have gained prominence in recommendation systems\nin recent years. By representing the user-item matrix as a bipartite and\nundirected graph, GNNs have demonstrated their potential to capture short- and\nlong-distance user-item interactions, thereby learning more accurate preference\npatterns than traditional recommendation approaches. 
In contrast to previous\ntutorials on the same topic, this tutorial aims to present and examine three\nkey aspects that characterize GNNs for recommendation: (i) the reproducibility\nof state-of-the-art approaches, (ii) the potential impact of graph topological\ncharacteristics on the performance of these models, and (iii) strategies for\nlearning node representations when training features from scratch or utilizing\npre-trained embeddings as additional item information (e.g., multimodal\nfeatures). The goal is to provide three novel theoretical and practical\nperspectives on the field, currently subject to debate in graph learning but\nlong been overlooked in the context of recommendation systems.\n","authors":["Daniele Malitesta","Claudio Pomo","Tommaso Di Noia"],"pdf_url":"https://arxiv.org/pdf/2310.11270v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.04633v3","updated":"2023-10-17T13:36:33Z","published":"2023-10-07T01:00:40Z","title":"Unbiased and Robust: External Attention-enhanced Graph Contrastive\n Learning for Cross-domain Sequential Recommendation","summary":" Cross-domain sequential recommenders (CSRs) are gaining considerable research\nattention as they can capture user sequential preference by leveraging side\ninformation from multiple domains. However, these works typically follow an\nideal setup, i.e., different domains obey similar data distribution, which\nignores the bias brought by asymmetric interaction densities (a.k.a. the\ninter-domain density bias). Besides, the frequently adopted mechanism (e.g.,\nthe self-attention network) in sequence encoder only focuses on the\ninteractions within a local view, which overlooks the global correlations\nbetween different training batches. To this end, we propose an External\nAttention-enhanced Graph Contrastive Learning framework, namely EA-GCL.\nSpecifically, to remove the impact of the inter-domain density bias, an\nauxiliary Self-Supervised Learning (SSL) task is attached to the traditional\ngraph encoder under a multi-task learning manner. To robustly capture users'\nbehavioral patterns, we develop an external attention-based sequence encoder\nthat contains an MLP-based memory-sharing structure. Unlike the self-attention\nmechanism, such a structure can effectively alleviate the bias interference\nfrom the batch-based training scheme. Extensive experiments on two real-world\ndatasets demonstrate that EA-GCL outperforms several state-of-the-art baselines\non CSR tasks. The source codes and relevant datasets are available at\nhttps://github.com/HoupingY/EA-GCL.\n","authors":["Xinhua Wang","Houping Yue","Zizheng Wang","Liancheng Xu","Jinyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.04633v3.pdf","comment":"9 pages, 4 figures, accepted by ICDM 2023 (workshop-GML4Rec)"},{"id":"http://arxiv.org/abs/2305.07609v3","updated":"2023-10-17T13:29:54Z","published":"2023-05-12T16:54:36Z","title":"Is ChatGPT Fair for Recommendation? Evaluating Fairness in Large\n Language Model Recommendation","summary":" The remarkable achievements of Large Language Models (LLMs) have led to the\nemergence of a novel recommendation paradigm -- Recommendation via LLM\n(RecLLM). Nevertheless, it is important to note that LLMs may contain social\nprejudices, and therefore, the fairness of recommendations made by RecLLM\nrequires further investigation. To avoid the potential risks of RecLLM, it is\nimperative to evaluate the fairness of RecLLM with respect to various sensitive\nattributes on the user side. 
Due to the differences between the RecLLM paradigm\nand the traditional recommendation paradigm, it is problematic to directly use\nthe fairness benchmark of traditional recommendation. To address the dilemma,\nwe propose a novel benchmark called Fairness of Recommendation via LLM\n(FaiRLLM). This benchmark comprises carefully crafted metrics and a dataset\nthat accounts for eight sensitive attributes1 in two recommendation scenarios:\nmusic and movies. By utilizing our FaiRLLM benchmark, we conducted an\nevaluation of ChatGPT and discovered that it still exhibits unfairness to some\nsensitive attributes when generating recommendations. Our code and dataset can\nbe found at https://github.com/jizhi-zhang/FaiRLLM.\n","authors":["Jizhi Zhang","Keqin Bao","Yang Zhang","Wenjie Wang","Fuli Feng","Xiangnan He"],"pdf_url":"https://arxiv.org/pdf/2305.07609v3.pdf","comment":"Accepted by Recsys 2023 (Short)"},{"id":"http://arxiv.org/abs/2305.00447v3","updated":"2023-10-17T13:29:42Z","published":"2023-04-30T10:55:56Z","title":"TALLRec: An Effective and Efficient Tuning Framework to Align Large\n Language Model with Recommendation","summary":" Large Language Models (LLMs) have demonstrated remarkable performance across\ndiverse domains, thereby prompting researchers to explore their potential for\nuse in recommendation systems. Initial attempts have leveraged the exceptional\ncapabilities of LLMs, such as rich knowledge and strong generalization through\nIn-context Learning, which involves phrasing the recommendation task as\nprompts. Nevertheless, the performance of LLMs in recommendation tasks remains\nsuboptimal due to a substantial disparity between the training tasks for LLMs\nand recommendation tasks, as well as inadequate recommendation data during\npre-training. To bridge the gap, we consider building a Large Recommendation\nLanguage Model by tunning LLMs with recommendation data. To this end, we\npropose an efficient and effective Tuning framework for Aligning LLMs with\nRecommendation, namely TALLRec. We have demonstrated that the proposed TALLRec\nframework can significantly enhance the recommendation capabilities of LLMs in\nthe movie and book domains, even with a limited dataset of fewer than 100\nsamples. Additionally, the proposed framework is highly efficient and can be\nexecuted on a single RTX 3090 with LLaMA-7B. Furthermore, the fine-tuned LLM\nexhibits robust cross-domain generalization. Our code and data are available at\nhttps://github.com/SAI990323/TALLRec.\n","authors":["Keqin Bao","Jizhi Zhang","Yang Zhang","Wenjie Wang","Fuli Feng","Xiangnan He"],"pdf_url":"https://arxiv.org/pdf/2305.00447v3.pdf","comment":"RecSys '23: Proceedings of the 17th ACM Conference on Recommender\n Systems; September 2023 Pages; 1007-1014"},{"id":"http://arxiv.org/abs/2308.09904v2","updated":"2023-10-17T11:48:10Z","published":"2023-08-19T04:46:01Z","title":"RAH! RecSys-Assistant-Human: A Human-Centered Recommendation Framework\n with LLM Agents","summary":" The rapid evolution of the web has led to an exponential growth in content.\nRecommender systems play a crucial role in Human-Computer Interaction (HCI) by\ntailoring content based on individual preferences. Despite their importance,\nchallenges persist in balancing recommendation accuracy with user satisfaction,\naddressing biases while preserving user privacy, and solving cold-start\nproblems in cross-domain situations. 
This research argues that addressing these\nissues is not solely the recommender systems' responsibility, and a\nhuman-centered approach is vital. We introduce the RAH Recommender system,\nAssistant, and Human) framework, an innovative solution with LLM-based agents\nsuch as Perceive, Learn, Act, Critic, and Reflect, emphasizing the alignment\nwith user personalities. The framework utilizes the Learn-Act-Critic loop and a\nreflection mechanism for improving user alignment. Using the real-world data,\nour experiments demonstrate the RAH framework's efficacy in various\nrecommendation domains, from reducing human burden to mitigating biases and\nenhancing user control. Notably, our contributions provide a human-centered\nrecommendation framework that partners effectively with various recommendation\nmodels.\n","authors":["Yubo Shu","Haonan Zhang","Hansu Gu","Peng Zhang","Tun Lu","Dongsheng Li","Ning Gu"],"pdf_url":"https://arxiv.org/pdf/2308.09904v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11088v1","updated":"2023-10-17T09:13:24Z","published":"2023-10-17T09:13:24Z","title":"MeKB-Rec: Personal Knowledge Graph Learning for Cross-Domain\n Recommendation","summary":" It is a long-standing challenge in modern recommender systems to effectively\nmake recommendations for new users, namely the cold-start problem. Cross-Domain\nRecommendation (CDR) has been proposed to address this challenge, but current\nways to represent users' interests across systems are still severely limited.\nWe introduce Personal Knowledge Graph (PKG) as a domain-invariant interest\nrepresentation, and propose a novel CDR paradigm named MeKB-Rec. We first link\nusers and entities in a knowledge base to construct a PKG of users' interests,\nnamed MeKB. Then we learn a semantic representation of MeKB for the\ncross-domain recommendation. To efficiently utilize limited training data in\nCDR, MeKB-Rec employs Pretrained Language Models to inject world knowledge into\nunderstanding users' interests. Beyond most existing systems, our approach\nbuilds a semantic mapping across domains which breaks the requirement for\nin-domain user behaviors, enabling zero-shot recommendations for new users in a\nlow-resource domain. We experiment MeKB-Rec on well-established public CDR\ndatasets, and demonstrate that the new formulation % is more powerful than\nprevious approaches, achieves a new state-of-the-art that significantly\nimproves HR@10 and NDCG@10 metrics over best previous approaches by 24\\%--91\\%,\nwith a 105\\% improvement for HR@10 of zero-shot users with no behavior in the\ntarget domain. We deploy MeKB-Rec in WeiXin recommendation scenarios and\nachieve significant gains in core online metrics. MeKB-Rec is now serving\nhundreds of millions of users in real-world products.\n","authors":["Xin Su","Yao Zhou","Zifei Shan","Qian Chen"],"pdf_url":"https://arxiv.org/pdf/2310.11088v1.pdf","comment":"13 pages, 4 figures, conference"},{"id":"http://arxiv.org/abs/2306.15010v3","updated":"2023-10-17T08:57:30Z","published":"2023-06-26T18:49:09Z","title":"Efficient High-Resolution Template Matching with Vector Quantized\n Nearest Neighbour Fields","summary":" Template matching is a fundamental problem in computer vision with\napplications in fields including object detection, image registration, and\nobject tracking. Current methods rely on nearest-neighbour (NN) matching, where\nthe query feature space is converted to NN space by representing each query\npixel with its NN in the template. 
NN-based methods have been shown to perform\nbetter in occlusions, appearance changes, and non-rigid transformations;\nhowever, they scale poorly with high-resolution data and high feature\ndimensions. We present an NN-based method which efficiently reduces the NN\ncomputations and introduces filtering in the NN fields (NNFs). A vector\nquantization step is introduced before the NN calculation to represent the\ntemplate with $k$ features, and the filter response over the NNFs is used to\ncompare the template and query distributions over the features. We show that\nstate-of-the-art performance is achieved in low-resolution data, and our method\noutperforms previous methods at higher resolution.\n","authors":["Ankit Gupta","Ida-Maria Sintorn"],"pdf_url":"https://arxiv.org/pdf/2306.15010v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11049v1","updated":"2023-10-17T07:35:11Z","published":"2023-10-17T07:35:11Z","title":"Nonet at SemEval-2023 Task 6: Methodologies for Legal Evaluation","summary":" This paper describes our submission to the SemEval-2023 for Task 6 on\nLegalEval: Understanding Legal Texts. Our submission concentrated on three\nsubtasks: Legal Named Entity Recognition (L-NER) for Task-B, Legal Judgment\nPrediction (LJP) for Task-C1, and Court Judgment Prediction with Explanation\n(CJPE) for Task-C2. We conducted various experiments on these subtasks and\npresented the results in detail, including data statistics and methodology. It\nis worth noting that legal tasks, such as those tackled in this research, have\nbeen gaining importance due to the increasing need to automate legal analysis\nand support. Our team obtained competitive rankings of 15$^{th}$, 11$^{th}$,\nand 1$^{st}$ in Task-B, Task-C1, and Task-C2, respectively, as reported on the\nleaderboard.\n","authors":["Shubham Kumar Nigam","Aniket Deroy","Noel Shallum","Ayush Kumar Mishra","Anup Roy","Shubham Kumar Mishra","Arnab Bhattacharya","Saptarshi Ghosh","Kripabandhu Ghosh"],"pdf_url":"https://arxiv.org/pdf/2310.11049v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2105.01331v3","updated":"2023-10-17T07:30:40Z","published":"2021-05-04T07:27:42Z","title":"BLM-17m: A Large-Scale Dataset for Black Lives Matter Topic Detection on\n Twitter","summary":" Protection of human rights is one of the most important problems of our\nworld. In this paper, our aim is to provide a dataset which covers one of the\nmost significant human rights contradiction in recent months affected the whole\nworld, George Floyd incident. We propose a labeled dataset for topic detection\nthat contains 17 million tweets. These Tweets are collected from 25 May 2020 to\n21 August 2020 that covers 89 days from start of this incident. We labeled the\ndataset by monitoring most trending news topics from global and local\nnewspapers. Apart from that, we present two baselines, TF-IDF and LDA. We\nevaluated the results of these two methods with three different k values for\nmetrics of precision, recall and f1-score. The collected dataset is available\nat https://github.com/MeysamAsgariC/BLMT.\n","authors":["Hasan Kemik","Nusret Özateş","Meysam Asgari-Chenaghlu","Yang Li","Erik Cambria"],"pdf_url":"https://arxiv.org/pdf/2105.01331v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.05521v3","updated":"2023-10-17T07:25:38Z","published":"2022-10-11T15:12:41Z","title":"Hybrid Inverted Index Is a Robust Accelerator for Dense Retrieval","summary":" Inverted file structure is a common technique for accelerating dense\nretrieval. 
It clusters documents based on their embeddings; during searching,\nit probes nearby clusters w.r.t. an input query and only evaluates documents\nwithin them by subsequent codecs, thus avoiding the expensive cost of\nexhaustive traversal. However, the clustering is always lossy, which results in\nthe miss of relevant documents in the probed clusters and hence degrades\nretrieval quality. In contrast, lexical matching, such as overlaps of salient\nterms, tends to be strong feature for identifying relevant documents. In this\nwork, we present the Hybrid Inverted Index (HI$^2$), where the embedding\nclusters and salient terms work collaboratively to accelerate dense retrieval.\nTo make best of both effectiveness and efficiency, we devise a cluster selector\nand a term selector, to construct compact inverted lists and efficiently\nsearching through them. Moreover, we leverage simple unsupervised algorithms as\nwell as end-to-end knowledge distillation to learn these two modules, with the\nlatter further boosting the effectiveness. Based on comprehensive experiments\non popular retrieval benchmarks, we verify that clusters and terms indeed\ncomplement each other, enabling HI$^2$ to achieve lossless retrieval quality\nwith competitive efficiency across various index settings. Our code and\ncheckpoint are publicly available at\nhttps://github.com/namespace-Pt/Adon/tree/HI2.\n","authors":["Peitian Zhang","Zheng Liu","Shitao Xiao","Zhicheng Dou","Jing Yao"],"pdf_url":"https://arxiv.org/pdf/2210.05521v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09234v2","updated":"2023-10-17T04:53:08Z","published":"2023-10-13T16:37:53Z","title":"ClickPrompt: CTR Models are Strong Prompt Generators for Adapting\n Language Models to CTR Prediction","summary":" Click-through rate (CTR) prediction has become increasingly indispensable for\nvarious Internet applications. Traditional CTR models convert the multi-field\ncategorical data into ID features via one-hot encoding, and extract the\ncollaborative signals among features. Such a paradigm suffers from the problem\nof semantic information loss. Another line of research explores the potential\nof pretrained language models (PLMs) for CTR prediction by converting input\ndata into textual sentences through hard prompt templates. Although semantic\nsignals are preserved, they generally fail to capture the collaborative\ninformation (e.g., feature interactions, pure ID features), not to mention the\nunacceptable inference overhead brought by the huge model size. In this paper,\nwe aim to model both the semantic knowledge and collaborative knowledge for\naccurate CTR estimation, and meanwhile address the inference inefficiency\nissue. To benefit from both worlds and close their gaps, we propose a novel\nmodel-agnostic framework (i.e., ClickPrompt), where we incorporate CTR models\nto generate interaction-aware soft prompts for PLMs. We design a\nprompt-augmented masked language modeling (PA-MLM) pretraining task, where PLM\nhas to recover the masked tokens based on the language context, as well as the\nsoft prompts generated by CTR model. The collaborative and semantic knowledge\nfrom ID and textual features would be explicitly aligned and interacted via the\nprompt interface. 
Then, we can either tune the CTR model with PLM for superior\nperformance, or solely tune the CTR model without PLM for inference efficiency.\nExperiments on four real-world datasets validate the effectiveness of\nClickPrompt compared with existing baselines.\n","authors":["Jianghao Lin","Bo Chen","Hangyu Wang","Yunjia Xi","Yanru Qu","Xinyi Dai","Kangning Zhang","Ruiming Tang","Yong Yu","Weinan Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.09234v2.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2212.10764v2","updated":"2023-10-17T22:15:38Z","published":"2022-12-21T04:49:55Z","title":"Learning List-Level Domain-Invariant Representations for Ranking","summary":" Domain adaptation aims to transfer the knowledge learned on (data-rich)\nsource domains to (low-resource) target domains, and a popular method is\ninvariant representation learning, which matches and aligns the data\ndistributions on the feature space. Although this method is studied extensively\nand applied on classification and regression problems, its adoption on ranking\nproblems is sporadic, and the few existing implementations lack theoretical\njustifications. This paper revisits invariant representation learning for\nranking. Upon reviewing prior work, we found that they implement what we call\nitem-level alignment, which aligns the distributions of the items being ranked\nfrom all lists in aggregate but ignores their list structure. However, the list\nstructure should be leveraged, because it is intrinsic to ranking problems\nwhere the data and the metrics are defined and computed on lists, not the items\nby themselves. To close this discrepancy, we propose list-level alignment --\nlearning domain-invariant representations at the higher level of lists. The\nbenefits are twofold: it leads to the first domain adaptation generalization\nbound for ranking, in turn providing theoretical support for the proposed\nmethod, and it achieves better empirical transfer performance for unsupervised\ndomain adaptation on ranking tasks, including passage reranking.\n","authors":["Ruicheng Xian","Honglei Zhuang","Zhen Qin","Hamed Zamani","Jing Lu","Ji Ma","Kai Hui","Han Zhao","Xuanhui Wang","Michael Bendersky"],"pdf_url":"https://arxiv.org/pdf/2212.10764v2.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2302.09473v2","updated":"2023-10-17T22:01:00Z","published":"2023-02-19T04:03:22Z","title":"Video-Text Retrieval by Supervised Sparse Multi-Grained Learning","summary":" While recent progress in video-text retrieval has been advanced by the\nexploration of better representation learning, in this paper, we present a\nnovel multi-grained sparse learning framework, S3MA, to learn an aligned sparse\nspace shared between the video and the text for video-text retrieval. The\nshared sparse space is initialized with a finite number of sparse concepts,\neach of which refers to a number of words. With the text data at hand, we learn\nand update the shared sparse space in a supervised manner using the proposed\nsimilarity and alignment losses. Moreover, to enable multi-grained alignment,\nwe incorporate frame representations for better modeling the video modality and\ncalculating fine-grained and coarse-grained similarities. Benefiting from the\nlearned shared sparse space and multi-grained similarities, extensive\nexperiments on several video-text retrieval benchmarks demonstrate the\nsuperiority of S3MA over existing methods. 
Our code is available at\nhttps://github.com/yimuwangcs/Better_Cross_Modal_Retrieval.\n","authors":["Yimu Wang","Peng Shi"],"pdf_url":"https://arxiv.org/pdf/2302.09473v2.pdf","comment":"Findings of EMNLP 2023"},{"id":"http://arxiv.org/abs/2301.03560v2","updated":"2023-10-17T21:42:38Z","published":"2023-01-09T18:20:55Z","title":"Solo: Data Discovery Using Natural Language Questions Via A\n Self-Supervised Approach","summary":" Most deployed data discovery systems, such as Google Datasets, and open data\nportals only support keyword search. Keyword search is geared towards general\naudiences but limits the types of queries the systems can answer. We propose a\nnew system that lets users write natural language questions directly. A major\nbarrier to using this learned data discovery system is it needs\nexpensive-to-collect training data, thus limiting its utility. In this paper,\nwe introduce a self-supervised approach to assemble training datasets and train\nlearned discovery systems without human intervention. It requires addressing\nseveral challenges, including the design of self-supervised strategies for data\ndiscovery, table representation strategies to feed to the models, and relevance\nmodels that work well with the synthetically generated questions. We combine\nall the above contributions into a system, Solo, that solves the problem end to\nend. The evaluation results demonstrate the new techniques outperform\nstate-of-the-art approaches on well-known benchmarks. All in all, the technique\nis a stepping stone towards building learned discovery systems. The code is\nopen-sourced at https://github.com/TheDataStation/solo\n","authors":["Qiming Wang","Raul Castro Fernandez"],"pdf_url":"https://arxiv.org/pdf/2301.03560v2.pdf","comment":"To appear at Sigmod 2024"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2310.11451v1","updated":"2023-10-17T17:58:34Z","published":"2023-10-17T17:58:34Z","title":"Seeking Neural Nuggets: Knowledge Transfer in Large Language Models from\n a Parametric Perspective","summary":" Large Language Models (LLMs) inherently encode a wealth of knowledge within\ntheir parameters through pre-training on extensive corpora. While prior\nresearch has delved into operations on these parameters to manipulate the\nunderlying implicit knowledge (encompassing detection, editing, and merging),\nthere remains an ambiguous understanding regarding their transferability across\nmodels with varying scales. In this paper, we seek to empirically investigate\nknowledge transfer from larger to smaller models through a parametric\nperspective. To achieve this, we employ sensitivity-based techniques to extract\nand align knowledge-specific parameters between different LLMs. Moreover, the\nLoRA module is used as the intermediary mechanism for injecting the extracted\nknowledge into smaller models. Evaluations across four benchmarks validate the\nefficacy of our proposed method. Our findings highlight the critical factors\ncontributing to the process of parametric knowledge transfer, underscoring the\ntransferability of model parameters across LLMs of different scales. 
We release\ncode and data at \\url{https://github.com/maszhongming/ParaKnowTransfer}.\n","authors":["Ming Zhong","Chenxin An","Weizhu Chen","Jiawei Han","Pengcheng He"],"pdf_url":"https://arxiv.org/pdf/2310.11451v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2310.11450v1","updated":"2023-10-17T17:58:19Z","published":"2023-10-17T17:58:19Z","title":"Explaining Deep Neural Networks for Bearing Fault Detection with\n Vibration Concepts","summary":" Concept-based explanation methods, such as Concept Activation Vectors, are\npotent means to quantify how abstract or high-level characteristics of input\ndata influence the predictions of complex deep neural networks. However,\napplying them to industrial prediction problems is challenging as it is not\nimmediately clear how to define and access appropriate concepts for individual\nuse cases and specific data types. In this work, we investigate how to leverage\nestablished concept-based explanation techniques in the context of bearing\nfault detection with deep neural networks trained on vibration signals. Since\nbearings are prevalent in almost every rotating equipment, ensuring the\nreliability of intransparent fault detection models is crucial to prevent\ncostly repairs and downtimes of industrial machinery. Our evaluations\ndemonstrate that explaining opaque models in terms of vibration concepts\nenables human-comprehensible and intuitive insights about their inner workings,\nbut the underlying assumptions need to be carefully validated first.\n","authors":["Thomas Decker","Michael Lebacher","Volker Tresp"],"pdf_url":"https://arxiv.org/pdf/2310.11450v1.pdf","comment":"2023 IEEE 21st International Conference on Industrial Informatics\n (INDIN)"},{"id":"http://arxiv.org/abs/2310.11449v1","updated":"2023-10-17T17:58:00Z","published":"2023-10-17T17:58:00Z","title":"DELIFFAS: Deformable Light Fields for Fast Avatar Synthesis","summary":" Generating controllable and photorealistic digital human avatars is a\nlong-standing and important problem in Vision and Graphics. Recent methods have\nshown great progress in terms of either photorealism or inference speed while\nthe combination of the two desired properties still remains unsolved. To this\nend, we propose a novel method, called DELIFFAS, which parameterizes the\nappearance of the human as a surface light field that is attached to a\ncontrollable and deforming human mesh model. At the core, we represent the\nlight field around the human with a deformable two-surface parameterization,\nwhich enables fast and accurate inference of the human appearance. This allows\nperceptual supervision on the full image compared to previous approaches that\ncould only supervise individual pixels or small patches due to their slow\nruntime. Our carefully designed human representation and supervision strategy\nleads to state-of-the-art synthesis results and inference time. 
The video\nresults and code are available at\nhttps://vcai.mpi-inf.mpg.de/projects/DELIFFAS.\n","authors":["Youngjoong Kwon","Lingjie Liu","Henry Fuchs","Marc Habermann","Christian Theobalt"],"pdf_url":"https://arxiv.org/pdf/2310.11449v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.10649v2","updated":"2023-10-17T17:55:33Z","published":"2023-10-16T17:59:54Z","title":"A Computational Framework for Solving Wasserstein Lagrangian Flows","summary":" The dynamical formulation of the optimal transport can be extended through\nvarious choices of the underlying geometry ($\\textit{kinetic energy}$), and the\nregularization of density paths ($\\textit{potential energy}$). These\ncombinations yield different variational problems ($\\textit{Lagrangians}$),\nencompassing many variations of the optimal transport problem such as the\nSchr\\\"odinger bridge, unbalanced optimal transport, and optimal transport with\nphysical constraints, among others. In general, the optimal density path is\nunknown, and solving these variational problems can be computationally\nchallenging. Leveraging the dual formulation of the Lagrangians, we propose a\nnovel deep learning based framework approaching all of these problems from a\nunified perspective. Our method does not require simulating or backpropagating\nthrough the trajectories of the learned dynamics, and does not need access to\noptimal couplings. We showcase the versatility of the proposed framework by\noutperforming previous approaches for the single-cell trajectory inference,\nwhere incorporating prior knowledge into the dynamics is crucial for correct\npredictions.\n","authors":["Kirill Neklyudov","Rob Brekelmans","Alexander Tong","Lazar Atanackovic","Qiang Liu","Alireza Makhzani"],"pdf_url":"https://arxiv.org/pdf/2310.10649v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11445v1","updated":"2023-10-17T17:55:32Z","published":"2023-10-17T17:55:32Z","title":"Stochastic Quantum Sampling for Non-Logconcave Distributions and\n Estimating Partition Functions","summary":" We present quantum algorithms for sampling from non-logconcave probability\ndistributions in the form of $\\pi(x) \\propto \\exp(-\\beta f(x))$. Here, $f$ can\nbe written as a finite sum $f(x):= \\frac{1}{N}\\sum_{k=1}^N f_k(x)$. Our\napproach is based on quantum simulated annealing on slowly varying Markov\nchains derived from unadjusted Langevin algorithms, removing the necessity for\nfunction evaluations which can be computationally expensive for large data sets\nin mixture modeling and multi-stable systems. We also incorporate a stochastic\ngradient oracle that implements the quantum walk operators inexactly by only\nusing mini-batch gradients. As a result, our stochastic gradient based\nalgorithm only accesses small subsets of data points in implementing the\nquantum walk. One challenge of quantizing the resulting Markov chains is that\nthey do not satisfy the detailed balance condition in general. Consequently,\nthe mixing time of the algorithm cannot be expressed in terms of the spectral\ngap of the transition density, making the quantum algorithms nontrivial to\nanalyze. To overcome these challenges, we first build a hypothetical Markov\nchain that is reversible, and also converges to the target distribution. Then,\nwe quantified the distance between our algorithm's output and the target\ndistribution by using this hypothetical chain as a bridge to establish the\ntotal complexity. 
Our quantum algorithms exhibit polynomial speedups in terms\nof both dimension and precision dependencies when compared to the best-known\nclassical algorithms.\n","authors":["Guneykan Ozgul","Xiantao Li","Mehrdad Mahdavi","Chunhao Wang"],"pdf_url":"https://arxiv.org/pdf/2310.11445v1.pdf","comment":"32 pages"},{"id":"http://arxiv.org/abs/2310.09986v2","updated":"2023-10-17T17:50:36Z","published":"2023-10-15T23:59:57Z","title":"On Statistical Learning of Branch and Bound for Vehicle Routing\n Optimization","summary":" Recently, machine learning of the branch and bound algorithm has shown\npromise in approximating competent solutions to NP-hard problems. In this\npaper, we utilize and comprehensively compare the outcomes of three neural\nnetworks--graph convolutional neural network (GCNN), GraphSAGE, and graph\nattention network (GAT)--to solve the capacitated vehicle routing problem. We\ntrain these neural networks to emulate the decision-making process of the\ncomputationally expensive Strong Branching strategy. The neural networks are\ntrained on six instances with distinct topologies from the CVRPLIB and\nevaluated on eight additional instances. Moreover, we reduced the minimum\nnumber of vehicles required to solve a CVRP instance to a bin-packing problem,\nwhich was addressed in a similar manner. Through rigorous experimentation, we\nfound that this approach can match or improve upon the performance of the\nbranch and bound algorithm with the Strong Branching strategy while requiring\nsignificantly less computational time. The source code that corresponds to our\nresearch findings and methodology is readily accessible and available for\nreference at the following web address: https://isotlaboratory.github.io/ml4vrp\n","authors":["Andrew Naguib","Waleed A. Yousef","Issa Traoré","Mohammad Mamun"],"pdf_url":"https://arxiv.org/pdf/2310.09986v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11439v1","updated":"2023-10-17T17:50:22Z","published":"2023-10-17T17:50:22Z","title":"Understanding deep neural networks through the lens of their\n non-linearity","summary":" The remarkable success of deep neural networks (DNN) is often attributed to\ntheir high expressive power and their ability to approximate functions of\narbitrary complexity. Indeed, DNNs are highly non-linear models, and activation\nfunctions introduced into them are largely responsible for this. While many\nworks studied the expressive power of DNNs through the lens of their\napproximation capabilities, quantifying the non-linearity of DNNs or of\nindividual activation functions remains an open problem. In this paper, we\npropose the first theoretically sound solution to track non-linearity\npropagation in deep neural networks with a specific focus on computer vision\napplications. Our proposed affinity score allows us to gain insights into the\ninner workings of a wide range of different architectures and learning\nparadigms. 
We provide extensive experimental results that highlight the\npractical utility of the proposed affinity score and its potential for\nlong-reaching applications.\n","authors":["Quentin Bouniot","Ievgen Redko","Anton Mallasto","Charlotte Laclau","Karol Arndt","Oliver Struckmeier","Markus Heinonen","Ville Kyrki","Samuel Kaski"],"pdf_url":"https://arxiv.org/pdf/2310.11439v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.09447v2","updated":"2023-10-17T17:46:01Z","published":"2023-03-16T16:23:13Z","title":"Steering Prototypes with Prompt-tuning for Rehearsal-free Continual\n Learning","summary":" In the context of continual learning, prototypes-as representative class\nembeddings-offer advantages in memory conservation and the mitigation of\ncatastrophic forgetting. However, challenges related to semantic drift and\nprototype interference persist. In this study, we introduce the Contrastive\nPrototypical Prompt (CPP) approach. Through task-specific prompt-tuning,\nunderpinned by a contrastive learning objective, we effectively address both\naforementioned challenges. Our evaluations on four challenging\nclass-incremental benchmarks reveal that CPP achieves a significant 4% to 6%\nimprovement over state-of-the-art methods. Importantly, CPP operates without a\nrehearsal buffer and narrows the performance divergence between continual and\noffline joint-learning, suggesting an innovative scheme for Transformer-based\ncontinual learning systems.\n","authors":["Zhuowei Li","Long Zhao","Zizhao Zhang","Han Zhang","Di Liu","Ting Liu","Dimitris N. Metaxas"],"pdf_url":"https://arxiv.org/pdf/2303.09447v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11431v1","updated":"2023-10-17T17:41:28Z","published":"2023-10-17T17:41:28Z","title":"Identifying Interpretable Visual Features in Artificial and Biological\n Neural Systems","summary":" Single neurons in neural networks are often ``interpretable'' in that they\nrepresent individual, intuitively meaningful features. However, many neurons\nexhibit $\\textit{mixed selectivity}$, i.e., they represent multiple unrelated\nfeatures. A recent hypothesis proposes that features in deep networks may be\nrepresented in $\\textit{superposition}$, i.e., on non-orthogonal axes by\nmultiple neurons, since the number of possible interpretable features in\nnatural data is generally larger than the number of neurons in a given network.\nAccordingly, we should be able to find meaningful directions in activation\nspace that are not aligned with individual neurons. Here, we propose (1) an\nautomated method for quantifying visual interpretability that is validated\nagainst a large database of human psychophysics judgments of neuron\ninterpretability, and (2) an approach for finding meaningful directions in\nnetwork activation space. We leverage these methods to discover directions in\nconvolutional neural networks that are more intuitively meaningful than\nindividual neurons, as we confirm and investigate in a series of analyses.\nMoreover, we apply the same method to two recent datasets of visual neural\nresponses in the brain and find that our conclusions largely transfer to real\nneural data, suggesting that superposition might be deployed by the brain. 
This\nalso provides a link with disentanglement and raises fundamental questions\nabout robust, efficient and factorized representations in both artificial and\nbiological neural systems.\n","authors":["David Klindt","Sophia Sanborn","Francisco Acosta","Frédéric Poitevin","Nina Miolane"],"pdf_url":"https://arxiv.org/pdf/2310.11431v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11428v1","updated":"2023-10-17T17:39:40Z","published":"2023-10-17T17:39:40Z","title":"Butterfly Effects of SGD Noise: Error Amplification in Behavior Cloning\n and Autoregression","summary":" This work studies training instabilities of behavior cloning with deep neural\nnetworks. We observe that minibatch SGD updates to the policy network during\ntraining result in sharp oscillations in long-horizon rewards, despite\nnegligibly affecting the behavior cloning loss. We empirically disentangle the\nstatistical and computational causes of these oscillations, and find them to\nstem from the chaotic propagation of minibatch SGD noise through unstable\nclosed-loop dynamics. While SGD noise is benign in the single-step action\nprediction objective, it results in catastrophic error accumulation over long\nhorizons, an effect we term gradient variance amplification (GVA). We show that\nmany standard mitigation techniques do not alleviate GVA, but find an\nexponential moving average (EMA) of iterates to be surprisingly effective at\ndoing so. We illustrate the generality of this phenomenon by showing the\nexistence of GVA and its amelioration by EMA in both continuous control and\nautoregressive language generation. Finally, we provide theoretical vignettes\nthat highlight the benefits of EMA in alleviating GVA and shed light on the\nextent to which classical convex models can help in understanding the benefits\nof iterate averaging in deep learning.\n","authors":["Adam Block","Dylan J. Foster","Akshay Krishnamurthy","Max Simchowitz","Cyril Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.11428v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11407v1","updated":"2023-10-17T17:14:07Z","published":"2023-10-17T17:14:07Z","title":"Group-blind optimal transport to group parity and its constrained\n variants","summary":" Fairness holds a pivotal role in the realm of machine learning, particularly\nwhen it comes to addressing groups categorised by sensitive attributes, e.g.,\ngender, race. Prevailing algorithms in fair learning predominantly hinge on\naccessibility or estimations of these sensitive attributes, at least in the\ntraining process. We design a single group-blind projection map that aligns the\nfeature distributions of both groups in the source data, achieving\n(demographic) group parity, without requiring values of the protected attribute\nfor individual samples in either the computation of the map or its use.\nInstead, our approach utilises the feature distributions of the privileged and\nunprivileged groups in a broader population and the essential assumption that\nthe source data are an unbiased representation of the population. 
We present\nnumerical results on synthetic data and real data.\n","authors":["Quan Zhou","Jakub Marecek"],"pdf_url":"https://arxiv.org/pdf/2310.11407v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11401v1","updated":"2023-10-17T17:10:56Z","published":"2023-10-17T17:10:56Z","title":"Enhancing Group Fairness in Online Settings Using Oblique Decision\n Forests","summary":" Fairness, especially group fairness, is an important consideration in the\ncontext of machine learning systems. The most commonly adopted group\nfairness-enhancing techniques are in-processing methods that rely on a mixture\nof a fairness objective (e.g., demographic parity) and a task-specific\nobjective (e.g., cross-entropy) during the training process. However, when data\narrives in an online fashion -- one instance at a time -- optimizing such\nfairness objectives poses several challenges. In particular, group fairness\nobjectives are defined using expectations of predictions across different\ndemographic groups. In the online setting, where the algorithm has access to a\nsingle instance at a time, estimating the group fairness objective requires\nadditional storage and significantly more computation (e.g., forward/backward\npasses) than the task-specific objective at every time step. In this paper, we\npropose Aranyani, an ensemble of oblique decision trees, to make fair decisions\nin online settings. The hierarchical tree structure of Aranyani enables\nparameter isolation and allows us to efficiently compute the fairness gradients\nusing aggregate statistics of previous decisions, eliminating the need for\nadditional storage and forward/backward passes. We also present an efficient\nframework to train Aranyani and theoretically analyze several of its\nproperties. We conduct empirical evaluations on 5 publicly available benchmarks\n(including vision and language datasets) to show that Aranyani achieves a\nbetter accuracy-fairness trade-off compared to baseline approaches.\n","authors":["Somnath Basu Roy Chowdhury","Nicholas Monath","Ahmad Beirami","Rahul Kidambi","Avinava Dubey","Amr Ahmed","Snigdha Chaturvedi"],"pdf_url":"https://arxiv.org/pdf/2310.11401v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2204.01815v6","updated":"2023-10-17T17:06:34Z","published":"2022-04-04T19:42:46Z","title":"Tensor Completion with Provable Consistency and Fairness Guarantees for\n Recommender Systems","summary":" We introduce a new consistency-based approach for defining and solving\nnonnegative/positive matrix and tensor completion problems. The novelty of the\nframework is that instead of artificially making the problem well-posed in the\nform of an application-arbitrary optimization problem, e.g., minimizing a bulk\nstructural measure such as rank or norm, we show that a single\nproperty/constraint: preserving unit-scale consistency, guarantees the\nexistence of both a solution and, under relatively weak support assumptions,\nuniqueness. The framework and solution algorithms also generalize directly to\ntensors of arbitrary dimensions while maintaining computational complexity that\nis linear in problem size for fixed dimension d. In the context of recommender\nsystem (RS) applications, we prove that two reasonable properties that should\nbe expected to hold for any solution to the RS problem are sufficient to permit\nuniqueness guarantees to be established within our framework. 
This is\nremarkable because it obviates the need for heuristic-based statistical or AI\nmethods despite what appear to be distinctly human/subjective variables at the\nheart of the problem. Key theoretical contributions include a general\nunit-consistent tensor-completion framework with proofs of its properties,\ne.g., consensus-order and fairness, and algorithms with optimal runtime and\nspace complexities, e.g., O(1) term-completion with preprocessing complexity\nthat is linear in the number of known terms of the matrix/tensor. From a\npractical perspective, the seamless ability of the framework to generalize to\nexploit high-dimensional structural relationships among key state variables,\ne.g., user and product attributes, offers a means for extracting significantly\nmore information than is possible for alternative methods that cannot\ngeneralize beyond direct user-product relationships.\n","authors":["Tung Nguyen","Jeffrey Uhlmann"],"pdf_url":"https://arxiv.org/pdf/2204.01815v6.pdf","comment":"Final published version"},{"id":"http://arxiv.org/abs/2303.04614v2","updated":"2023-10-17T17:06:04Z","published":"2023-03-08T14:35:03Z","title":"Densely Connected $G$-invariant Deep Neural Networks with Signed\n Permutation Representations","summary":" We introduce and investigate, for finite groups $G$, $G$-invariant deep\nneural network ($G$-DNN) architectures with ReLU activation that are densely\nconnected -- i.e., include all possible skip connections. In contrast to other\n$G$-invariant architectures in the literature, the preactivations of\nthe $G$-DNNs presented here are able to transform by \\emph{signed} permutation\nrepresentations (signed perm-reps) of $G$. Moreover, the individual layers of\nthe $G$-DNNs are not required to be $G$-equivariant; instead, the\npreactivations are constrained to be $G$-equivariant functions of the network\ninput in a way that couples weights across all layers. The result is a richer\nfamily of $G$-invariant architectures never seen previously. We derive an\nefficient implementation of $G$-DNNs after a reparameterization of weights, as\nwell as necessary and sufficient conditions for an architecture to be\n``admissible'' -- i.e., nondegenerate and inequivalent to smaller architectures.\nWe include code that allows a user to build a $G$-DNN interactively\nlayer-by-layer, with the final architecture guaranteed to be admissible. We\nshow that there are far more admissible $G$-DNN architectures than those\naccessible with the ``concatenated ReLU'' activation function from the\nliterature. Finally, we apply $G$-DNNs to two example problems -- (1)\nmultiplication in $\\{-1, 1\\}$ (with theoretical guarantees) and (2) 3D object\nclassification -- finding that the inclusion of signed perm-reps\nsignificantly boosts predictive performance compared to baselines with only\nordinary (i.e., unsigned) perm-reps.\n","authors":["Devanshu Agrawal","James Ostrowski"],"pdf_url":"https://arxiv.org/pdf/2303.04614v2.pdf","comment":"40 pages, 2 figures, 4 tables. For associated code repository see\n https://github.com/dagrawa2/gdnn_code"},{"id":"http://arxiv.org/abs/2310.11397v1","updated":"2023-10-17T17:03:00Z","published":"2023-10-17T17:03:00Z","title":"Last One Standing: A Comparative Analysis of Security and Privacy of\n Soft Prompt Tuning, LoRA, and In-Context Learning","summary":" Large Language Models (LLMs) are powerful tools for natural language\nprocessing, enabling novel applications and user experiences. 
However, to\nachieve optimal performance, LLMs often require adaptation with private data,\nwhich poses privacy and security challenges. Several techniques have been\nproposed to adapt LLMs with private data, such as Low-Rank Adaptation (LoRA),\nSoft Prompt Tuning (SPT), and In-Context Learning (ICL), but their comparative\nprivacy and security properties have not been systematically investigated. In\nthis work, we fill this gap by evaluating the robustness of LoRA, SPT, and ICL\nagainst three types of well-established attacks: membership inference, which\nexposes data leakage (privacy); backdoor, which injects malicious behavior\n(security); and model stealing, which can violate intellectual property\n(privacy and security). Our results show that there is no silver bullet for\nprivacy and security in LLM adaptation and each technique has different\nstrengths and weaknesses.\n","authors":["Rui Wen","Tianhao Wang","Michael Backes","Yang Zhang","Ahmed Salem"],"pdf_url":"https://arxiv.org/pdf/2310.11397v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.05372v2","updated":"2023-10-17T16:56:55Z","published":"2023-02-10T16:50:40Z","title":"Towards Minimax Optimality of Model-based Robust Reinforcement Learning","summary":" We study the sample complexity of obtaining an $\\epsilon$-optimal policy in\n\\emph{Robust} discounted Markov Decision Processes (RMDPs), given only access\nto a generative model of the nominal kernel. This problem is widely studied in\nthe non-robust case, and it is known that any planning approach applied to an\nempirical MDP estimated with $\\tilde{\\mathcal{O}}(\\frac{H^3 \\mid S \\mid\\mid A\n\\mid}{\\epsilon^2})$ samples provides an $\\epsilon$-optimal policy, which is\nminimax optimal. Results in the robust case are much more scarce. For $sa$-\n(resp $s$-)rectangular uncertainty sets, the best known sample complexity is\n$\\tilde{\\mathcal{O}}(\\frac{H^4 \\mid S \\mid^2\\mid A \\mid}{\\epsilon^2})$ (resp.\n$\\tilde{\\mathcal{O}}(\\frac{H^4 \\mid S \\mid^2\\mid A \\mid^2}{\\epsilon^2})$), for\nspecific algorithms and when the uncertainty set is based on the total\nvariation (TV), the KL or the Chi-square divergences. In this paper, we\nconsider uncertainty sets defined with an $L_p$-ball (recovering the TV case),\nand study the sample complexity of \\emph{any} planning algorithm (with high\naccuracy guarantee on the solution) applied to an empirical RMDP estimated\nusing the generative model. In the general case, we prove a sample complexity\nof $\\tilde{\\mathcal{O}}(\\frac{H^4 \\mid S \\mid\\mid A \\mid}{\\epsilon^2})$ for\nboth the $sa$- and $s$-rectangular cases (improvements of $\\mid S \\mid$ and\n$\\mid S \\mid\\mid A \\mid$ respectively). When the size of the uncertainty is\nsmall enough, we improve the sample complexity to\n$\\tilde{\\mathcal{O}}(\\frac{H^3 \\mid S \\mid\\mid A \\mid }{\\epsilon^2})$,\nrecovering the lower-bound for the non-robust case for the first time and a\nrobust lower-bound when the size of the uncertainty is small enough.\n","authors":["Pierre Clavier","Erwan Le Pennec","Matthieu Geist"],"pdf_url":"https://arxiv.org/pdf/2302.05372v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11389v1","updated":"2023-10-17T16:35:39Z","published":"2023-10-17T16:35:39Z","title":"VaR\\ and CVaR Estimation in a Markov Cost Process: Lower and Upper\n Bounds","summary":" We tackle the problem of estimating the Value-at-Risk (VaR) and the\nConditional Value-at-Risk (CVaR) of the infinite-horizon discounted cost within\na Markov cost process. 
First, we derive a minimax lower bound of\n$\\Omega(1/\\sqrt{n})$ that holds both in an expected and in a probabilistic\nsense. Then, using a finite-horizon truncation scheme, we derive an upper bound\nfor the error in CVaR estimation, which matches our lower bound up to constant\nfactors. Finally, we discuss an extension of our estimation scheme that covers\nmore general risk measures satisfying a certain continuity criterion, e.g.,\nspectral risk measures, utility-based shortfall risk. To the best of our\nknowledge, our work is the first to provide lower and upper bounds on the\nestimation error for any risk measure within Markovian settings. We remark that\nour lower bounds also extend to the infinite-horizon discounted costs' mean.\nEven in that case, our result $\\Omega(1/\\sqrt{n}) $ improves upon the existing\nresult $\\Omega(1/n)$[13].\n","authors":["Sanjay Bhat","Prashanth L. A.","Gugan Thoppe"],"pdf_url":"https://arxiv.org/pdf/2310.11389v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.16309v3","updated":"2023-10-17T16:34:46Z","published":"2023-05-25T17:58:14Z","title":"Imitating Task and Motion Planning with Visuomotor Transformers","summary":" Imitation learning is a powerful tool for training robot manipulation\npolicies, allowing them to learn from expert demonstrations without manual\nprogramming or trial-and-error. However, common methods of data collection,\nsuch as human supervision, scale poorly, as they are time-consuming and\nlabor-intensive. In contrast, Task and Motion Planning (TAMP) can autonomously\ngenerate large-scale datasets of diverse demonstrations. In this work, we show\nthat the combination of large-scale datasets generated by TAMP supervisors and\nflexible Transformer models to fit them is a powerful paradigm for robot\nmanipulation. To that end, we present a novel imitation learning system called\nOPTIMUS that trains large-scale visuomotor Transformer policies by imitating a\nTAMP agent. OPTIMUS introduces a pipeline for generating TAMP data that is\nspecifically curated for imitation learning and can be used to train performant\ntransformer-based policies. In this paper, we present a thorough study of the\ndesign decisions required to imitate TAMP and demonstrate that OPTIMUS can\nsolve a wide variety of challenging vision-based manipulation tasks with over\n70 different objects, ranging from long-horizon pick-and-place tasks, to shelf\nand articulated object manipulation, achieving 70 to 80% success rates. Video\nresults and code at https://mihdalal.github.io/optimus/\n","authors":["Murtaza Dalal","Ajay Mandlekar","Caelan Garrett","Ankur Handa","Ruslan Salakhutdinov","Dieter Fox"],"pdf_url":"https://arxiv.org/pdf/2305.16309v3.pdf","comment":"Conference on Robot Learning (CoRL) 2023. 8 pages, 5 figures, 2\n tables; 11 pages appendix (10 additional figures)"},{"id":"http://arxiv.org/abs/2310.03708v2","updated":"2023-10-17T16:29:35Z","published":"2023-10-05T17:35:26Z","title":"Beyond One-Preference-for-All: Multi-Objective Direct Preference\n Optimization for Language Models","summary":" A single language model (LM), despite aligning well with an average labeler\nthrough reinforcement learning from human feedback (RLHF), may not universally\nsuit diverse human preferences. Recent approaches thus pursue customization,\ntraining separate principle-based reward models to represent different\nalignment objectives (e.g. helpfulness, harmlessness, or honesty). 
Different\nLMs can then be trained for different preferences through multi-objective RLHF\n(MORLHF) with different objective weightings. Yet, RLHF is unstable and\nresource-heavy, especially for MORLHF with diverse and usually conflicting\nobjectives. In this paper, we present Multi-Objective Direct Preference\nOptimization (MODPO), an RL-free algorithm that extends Direct Preference\nOptimization (DPO) for multiple alignment objectives. Essentially, MODPO folds\nLM learning directly into reward modeling, aligning LMs with the weighted sum\nof all principle-based rewards using pure cross-entropy loss. While\ntheoretically guaranteed to produce the same optimal solutions as MORLHF, MODPO\nis practically more stable and computationally efficient, obviating value\nfunction modeling and online sample collection. Empirical results in safety\nalignment and long-form question answering confirm that MODPO matches or\noutperforms existing methods, consistently producing one of the most\ncompetitive LM fronts that cater to diverse preferences with 3 times fewer\ncomputations compared with MORLHF.\n","authors":["Zhanhui Zhou","Jie Liu","Chao Yang","Jing Shao","Yu Liu","Xiangyu Yue","Wanli Ouyang","Yu Qiao"],"pdf_url":"https://arxiv.org/pdf/2310.03708v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.05726v2","updated":"2023-10-17T16:25:25Z","published":"2023-06-09T07:46:24Z","title":"Iteratively Refined Behavior Regularization for Offline Reinforcement\n Learning","summary":" One of the fundamental challenges for offline reinforcement learning (RL) is\nensuring robustness to the data distribution. Whether the data originates from a\nnear-optimal policy or not, we anticipate that an algorithm should demonstrate\nits ability to learn an effective control policy that seamlessly aligns with\nthe inherent distribution of offline data. Unfortunately, behavior\nregularization, a simple yet effective offline RL algorithm, tends to struggle\nin this regard. In this paper, we propose a new algorithm that substantially\nenhances behavior regularization based on conservative policy iteration. Our\nkey observation is that by iteratively refining the reference policy used for\nbehavior regularization, the conservative policy update guarantees gradual\nimprovement, while also implicitly avoiding querying out-of-sample actions to\nprevent catastrophic learning failures. We prove that in the tabular setting\nthis algorithm is capable of learning the optimal policy covered by the offline\ndataset, commonly referred to as the in-sample optimal policy. We then explore\nseveral implementation details of the algorithm when function approximations\nare applied. The resulting algorithm is easy to implement, requiring only a few\nlines of code modification to existing methods. Experimental results on the\nD4RL benchmark indicate that our method outperforms previous state-of-the-art\nbaselines in most tasks, clearly demonstrating its superiority over behavior\nregularization.\n","authors":["Xiaohan Hu","Yi Ma","Chenjun Xiao","Yan Zheng","Jianye Hao"],"pdf_url":"https://arxiv.org/pdf/2306.05726v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2107.02565v4","updated":"2023-10-17T16:22:04Z","published":"2021-07-06T12:08:44Z","title":"Prioritized training on points that are learnable, worth learning, and\n not yet learned (workshop version)","summary":" We introduce Goldilocks Selection, a technique for faster model training\nwhich selects a sequence of training points that are \"just right\". 
We propose\nan information-theoretic acquisition function -- the reducible validation loss\n-- and compute it with a small proxy model -- GoldiProx -- to efficiently\nchoose training points that maximize information about a validation set. We\nshow that the \"hard\" (e.g. high loss) points usually selected in the\noptimization literature are typically noisy, while the \"easy\" (e.g. low noise)\nsamples often prioritized for curriculum learning confer less information.\nFurther, points with uncertain labels, typically targeted by active learning,\ntend to be less relevant to the task. In contrast, Goldilocks Selection chooses\npoints that are \"just right\" and empirically outperforms the above approaches.\nMoreover, the selected sequence can transfer to other architectures;\npractitioners can share and reuse it without the need to recreate it.\n","authors":["Sören Mindermann","Muhammed Razzak","Winnie Xu","Andreas Kirsch","Mrinank Sharma","Adrien Morisot","Aidan N. Gomez","Sebastian Farquhar","Jan Brauner","Yarin Gal"],"pdf_url":"https://arxiv.org/pdf/2107.02565v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11377v1","updated":"2023-10-17T16:21:28Z","published":"2023-10-17T16:21:28Z","title":"Faster Algorithms for Generalized Mean Densest Subgraph Problem","summary":" The densest subgraph of a large graph usually refers to some subgraph with\nthe highest average degree, which has been extended to the family of $p$-means\ndense subgraph objectives by~\\citet{veldt2021generalized}. The $p$-mean densest\nsubgraph problem seeks a subgraph with the highest average $p$-th-power degree,\nwhereas the standard densest subgraph problem seeks a subgraph with simply the\nhighest average degree. It was shown that the standard peeling algorithm can\nperform arbitrarily poorly on the generalized objective when $p>1$ but uncertain\nwhen $0